doc/tips/importing_posts_from_wordpress/ikiwiki-wordpress-import.mdwn

   1 [[!meta title="ikiwiki-wordpress-import"]]
   2
   3 I modified the script a bit so categories and tags would actually show up in the output file.
   4
   5 -----
   6 <pre>
   7 #!/usr/bin/env python
   8
   9 """
  10     Purpose:
  11     Wordpress-to-Ikiwiki import tool
  12
  13     Copyright:
  14     Copyright (C) 2007  Chris Lamb &lt;chris@chris-lamb.co.uk&gt;
  15
  16     This program is free software: you can redistribute it and/or modify
  17     it under the terms of the GNU General Public License as published by
  18     the Free Software Foundation, either version 3 of the License, or
  19     (at your option) any later version.
  20
  21     This program is distributed in the hope that it will be useful,
  22     but WITHOUT ANY WARRANTY; without even the implied warranty of
  23     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  24     GNU General Public License for more details.
  25
  26     You should have received a copy of the GNU General Public License
  27     along with this program.  If not, see &lt;http://www.gnu.org/licenses/&gt;.
  28
  29     Usage: run --help as an argument with this script.
  30
  31     Notes:
  32     I added some extra bits to include the \[[!tag foo]] stuff in the post,
  33     as it wasn't before, at all. I'll diff the versions out so you can see
  34     the mess I made :).
  35
  36 """
  37
  38 import os, sys
  39 import time
  40 import re
  41
  42 from BeautifulSoup import BeautifulSoup
  43
  44 import codecs, htmlentitydefs
  45
  46 codecs.register_error('html_replace', lambda x: (''.join([u'&%s;' \
  47     % htmlentitydefs.codepoint2name[ord(c)] for c in x.object[x.start:x.end]]), x.end))
  48
  49 def main(name, email, subdir, branch='master'):
  50     soup = BeautifulSoup(sys.stdin.read())
  51
  52     # Regular expression to match stub in URL.
  53     stub_pattern = re.compile(r'.*\/(.+)\/$')
  54
  55     for x in soup.findAll('item'):
  56         # Ignore draft posts
  57         if x.find('wp:status').string != 'publish': continue
  58
  59         match = stub_pattern.match(x.guid.string)
  60         if match:
  61             stub = match.groups()[0]
  62         else:
  63             # Fall back to our own stubs
  64             stub = re.sub(r'[^a-zA-Z0-9_]', '-', x.title.string).lower()
  65
  66         commit_msg = """Importing WordPress post "%s" [%s]""" % (x.title.string, x.guid.string)
  67         timestamp = time.mktime(time.strptime(x.find('wp:post_date_gmt').string, "%Y-%m-%d %H:%M:%S"))
  68
  69         content = '\[[!meta title="%s"]]\n\n' % (x.title.string.replace('"', r'\"'))
  70         content += x.find('content:encoded').string.replace('\r\n', '\n')
  71
  72         # categories = x.findAll('category')
  73         # categories = x.findAll({'category':True}, attrs={'domain':re.compile(('category|tag'))})
  74         # categories = x.findAll({'category':True}, domain=["category", "tag"])
  75         # categories = x.findAll({'category':True}, nicename=True)
  76         """
  77         We do it differently here because we have duplicates otherwise.
  78         Take a look:
  79         &lt;category&gt;&lt;![CDATA[Health]]&gt;&lt;/category&gt;
  80         &lt;category domain="category" nicename="health"&gt;&lt;![CDATA[Health]]&gt;&lt;/category&gt;
  81
  82         If we do the what original did, we end up with all tags and cats doubled.
  83         Therefore we only pick out nicename="foo". Our 'True' below is our 'foo'.
  84         I'd much rather have the value of 'nicename', and tried, but my
  85         python skillz are extremely limited....
  86         """
  87         categories = x.findAll('category', nicename=True)
  88         if categories:
  89             content += "\n"
  90             for cat in categories:
  91                 # remove 'tags/' because we have a 'tagbase' set.
  92                 # your choice: 'tag', or 'taglink'
  93                 # content += "\n\[[!tag %s]]" % (cat.string.replace(' ', '-'))
  94                 content += "\n\[[!taglink %s]]" % (cat.string.replace(' ', '-'))
  95                 # print >>sys.stderr, cat.string.replace(' ', '-')
  96
  97         # moved this thing down
  98         data = content.encode('ascii', 'html_replace')
  99         print "commit refs/heads/%s" % branch
 100         print "committer %s &lt;%s&gt; %d +0000" % (name, email, timestamp)
 101         print "data %d" % len(commit_msg)
 102         print commit_msg
 103         print "M 644 inline %s" % os.path.join(subdir, "%s.mdwn" % stub)
 104         print "data %d" % len(data)
 105         print data
 106
 107 if __name__ == "__main__":
 108     if len(sys.argv) not in (4, 5):
 109         print >>sys.stderr, "%s: usage: %s name email subdir [branch] < wordpress-export.xml | git-fast-import " % (sys.argv[0], sys.argv[0])
 110     else:
 111         main(*sys.argv[1:])
 112
 113 </pre>
 114 -----
 115
 116 I have another version of the script, which takes the `timestamp` computed in the script and inserts it into each post as a \[[!meta date="foodate"]] directive. I'm posting it here just in case I happen to be doing something to the httpd.
 117
 118 (Hopefully I've escaped everything properly; if I missed something, check the source.)
 119
 120 -----
 121 <pre>
 122 #!/usr/bin/env python
 123
 124 """
 125     Purpose:
 126     Wordpress-to-Ikiwiki import tool
 127
 128     Copyright:
 129     Copyright (C) 2007  Chris Lamb &lt;chris@chris-lamb.co.uk&gt;
 130
 131     This program is free software: you can redistribute it and/or modify
 132     it under the terms of the GNU General Public License as published by
 133     the Free Software Foundation, either version 3 of the License, or
 134     (at your option) any later version.
 135
 136     This program is distributed in the hope that it will be useful,
 137     but WITHOUT ANY WARRANTY; without even the implied warranty of
 138     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 139     GNU General Public License for more details.
 140
 141     You should have received a copy of the GNU General Public License
 142     along with this program.  If not, see &lt;http://www.gnu.org/licenses/&gt;.
 143
 144     Usage: run --help as an argument with this script.
 145
 146     Notes:
 147     I added some extra bits to include the \[[!tag foo]] stuff in the post,
 148     as it wasn't before, at all. I'll diff the versions out so you can see
 149     the mess I made :).
 150
 151 """
 152
 153 import os, sys
 154 import time
 155 import re
 156
 157 from datetime import datetime
 158 from BeautifulSoup import BeautifulSoup
 159
 160 import codecs, htmlentitydefs
 161
 162 codecs.register_error('html_replace', lambda x: (''.join([u'&%s;' \
 163     % htmlentitydefs.codepoint2name[ord(c)] for c in x.object[x.start:x.end]]), x.end))
 164
 165 def main(name, email, subdir, branch='master'):
 166     soup = BeautifulSoup(sys.stdin.read())
 167
 168     # Regular expression to match stub in URL.
 169     stub_pattern = re.compile(r'.*\/(.+)\/$')
 170
 171     for x in soup.findAll('item'):
 172         # Ignore draft posts
 173         if x.find('wp:status').string != 'publish': continue
 174
 175         match = stub_pattern.match(x.guid.string)
 176         if match:
 177             stub = match.groups()[0]
 178         else:
 179             # Fall back to our own stubs
 180             stub = re.sub(r'[^a-zA-Z0-9_]', '-', x.title.string).lower()
 181
 182         commit_msg = """Importing WordPress post "%s" [%s]""" % (x.title.string, x.guid.string)
 183         timestamp = time.mktime(time.strptime(x.find('wp:post_date_gmt').string, "%Y-%m-%d %H:%M:%S"))
 184         content = '\[[!meta title="%s"]]\n' % (x.title.string.replace('"', r'\"'))
 185         content += "\[[!meta date=\"%s\"]]\n" % datetime.fromtimestamp(timestamp)
 186         content += x.find('content:encoded').string.replace('\r\n', '\n')
 187
 188         """
 189         We do it differently here because we have duplicates otherwise.
 190         Take a look:
 191         &lt;category&gt;&lt;![CDATA[Health]]&gt;&lt;/category&gt;
 192         &lt;category domain="category" nicename="health"&gt;&lt;![CDATA[Health]]&gt;&lt;/category&gt;
 193
 194         If we do the what original did, we end up with all tags and cats doubled.
 195         Therefore we only pick out nicename="foo". Our 'True' below is our 'foo'.
 196         I'd much rather have the value of 'nicename', and tried, but my
 197         python skillz are extremely limited....
 198         """
 199         categories = x.findAll('category', nicename=True)
 200         if categories:
 201             content += "\n"
 202             for cat in categories:
 203                 # remove 'tags/' because we have a 'tagbase' set.
 204                 # your choice: 'tag', or 'taglink'
 205                 # content += "\n\[[!tag %s]]" % (cat.string.replace(' ', '-'))
 206                 content += "\n\[[!taglink %s]]" % (cat.string.replace(' ', '-'))
 207                 # this is just debugging, and for fun
 208                 # print >>sys.stderr, cat.string.replace(' ', '-')
 209
 210         # moved this thing down
 211         data = content.encode('ascii', 'html_replace')
 212         print "commit refs/heads/%s" % branch
 213         print "committer %s &lt;%s&gt; %d +0000" % (name, email, timestamp)
 214         print "data %d" % len(commit_msg)
 215         print commit_msg
 216         print "M 644 inline %s" % os.path.join(subdir, "%s.mdwn" % stub)
 217         print "data %d" % len(data)
 218         print data
 219
 220 if __name__ == "__main__":
 221     if len(sys.argv) not in (4, 5):
 222         print >>sys.stderr, "%s: usage: %s name email subdir [branch] < wordpress-export.xml | git-fast-import " % (sys.argv[0], sys.argv[0])
 223     else:
 224         main(*sys.argv[1:])
 225
 226 </pre>
 227 -----
 228
 229
 230 [[!tag wordpress]]
 231 [[!tag python]]
 232 [[!tag conversion]]
 233 [[!tag ikiwiki]]