1 [[!meta title="ikiwiki-wordpress-import"]]
3 I modified the script a bit so categories and tags would actually show up in the output file.
11 Wordpress-to-Ikiwiki import tool
14 Copyright (C) 2007 Chris Lamb <chris@chris-lamb.co.uk>
16 This program is free software: you can redistribute it and/or modify
17 it under the terms of the GNU General Public License as published by
18 the Free Software Foundation, either version 3 of the License, or
19 (at your option) any later version.
21 This program is distributed in the hope that it will be useful,
22 but WITHOUT ANY WARRANTY; without even the implied warranty of
23 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 GNU General Public License for more details.
26 You should have received a copy of the GNU General Public License
27 along with this program. If not, see <http://www.gnu.org/licenses/>.
29 Usage: run --help as an argument with this script.
32 I added some extra bits so that the [[!tag foo]] directives are included in the post,
33 since the original did not emit them at all. I'll diff the versions out so you can see
42 from BeautifulSoup import BeautifulSoup
44 import codecs, htmlentitydefs
def _html_replace(error):
    """Encoding error handler that substitutes HTML entities.

    Each character the target codec cannot encode becomes a named entity
    (``&eacute;``) when ``htmlentitydefs.codepoint2name`` has one, and a
    decimal numeric character reference (``&#1046;``) otherwise.

    error -- the encode error; ``error.object[error.start:error.end]`` is
             the run of characters that could not be encoded.

    Returns the ``(replacement, resume_position)`` pair that ``codecs``
    expects from an error handler.
    """
    pieces = []
    for char in error.object[error.start:error.end]:
        codepoint = ord(char)
        entity = htmlentitydefs.codepoint2name.get(codepoint)
        if entity is None:
            # No named entity for this code point: a plain dict lookup would
            # raise KeyError here and abort the whole encode, so emit a
            # numeric character reference instead.
            pieces.append(u'&#%d;' % codepoint)
        else:
            pieces.append(u'&%s;' % entity)
    return (u''.join(pieces), error.end)


codecs.register_error('html_replace', _html_replace)
def main(name, email, subdir, branch='master'):
    """Turn a WordPress export read from stdin into a git fast-import stream.

    Every ``<item>`` whose ``wp:status`` is ``publish`` becomes one commit on
    ``branch`` that writes ``<subdir>/<stub>.mdwn``. The page consists of a
    ``[[!meta title="..."]]`` directive, the post body, and one
    ``[[!taglink ...]]`` directive per category/tag element.

    name   -- committer name put on every commit
    email  -- committer e-mail address put on every commit
    subdir -- directory that receives the generated ``.mdwn`` pages
    branch -- branch the commits are made on (default ``master``)

    The stream is written to stdout; nothing is returned. Output is
    assembled as byte strings, matching the Python 2 string model the rest
    of this script uses.
    """
    # Local import: keeps this function self-contained instead of relying on
    # a new file-level import.
    import calendar

    soup = BeautifulSoup(sys.stdin.read())
    write = sys.stdout.write

    # Regular expression to match stub in URL.
    stub_pattern = re.compile(r'.*\/(.+)\/$')

    for x in soup.findAll('item'):
        # Anything that is not published is left out of the import.
        if x.find('wp:status').string != 'publish':
            continue

        match = stub_pattern.match(x.guid.string)
        if match:
            stub = match.groups()[0]
        else:
            # Fall back to our own stubs
            stub = re.sub(r'[^a-zA-Z0-9_]', '-', x.title.string).lower()

        # Encode before measuring: fast-import's "data <n>" counts bytes,
        # not unicode characters.
        commit_msg = """Importing WordPress post "%s" [%s]""" % (x.title.string, x.guid.string)
        commit_msg = commit_msg.encode('utf-8')

        # The export field is GMT and the committer line below says +0000,
        # so convert with timegm (UTC) rather than mktime (local time).
        timestamp = calendar.timegm(time.strptime(x.find('wp:post_date_gmt').string, "%Y-%m-%d %H:%M:%S"))

        content = '[[!meta title="%s"]]\n\n' % (x.title.string.replace('"', r'\"'))
        content += x.find('content:encoded').string.replace('\r\n', '\n')

        # We do it differently here because we have duplicates otherwise:
        #
        #   <category><![CDATA[Health]]></category>
        #   <category domain="category" nicename="health"><![CDATA[Health]]></category>
        #
        # Doing what the original did ends up with all tags and categories
        # doubled, so only elements carrying a 'nicename' attribute are
        # picked. The element text is used; the value of 'nicename' itself
        # is not read.
        categories = x.findAll('category', nicename=True)

        for cat in categories:
            # No 'tags/' prefix because we have a 'tagbase' set.
            # Your choice of directive: 'tag' or 'taglink'.
            content += "\n[[!taglink %s]]" % (cat.string.replace(' ', '-'))

        # Encode only once the tag directives have been appended.
        data = content.encode('ascii', 'html_replace')
        path = os.path.join(subdir, "%s.mdwn" % stub.encode('utf-8'))

        write("commit refs/heads/%s\n" % branch)
        write("committer %s <%s> %d +0000\n" % (name, email, timestamp))
        # Each "data <n>" header must be followed by exactly n bytes of
        # payload; the trailing newline after the payload is optional in the
        # fast-import format.
        write("data %d\n" % len(commit_msg))
        write(commit_msg + "\n")
        write("M 644 inline %s\n" % path)
        write("data %d\n" % len(data))
        write(data + "\n")
107 if __name__ == "__main__":
108 if len(sys.argv) not in (4, 5):
109 print >>sys.stderr, "%s: usage: %s name email subdir [branch] < wordpress-export.xml | git-fast-import " % (sys.argv[0], sys.argv[0])