[[!meta title="ikiwiki-wordpress-import"]]
I converted the script to Perl. The new version gets your name and email automatically from your git config, converts the body of your posts to markdown, and also imports comments. More importantly, it works with the latest WordPress, which the Python version does not. Note that it's still not 100% perfect; I intend to make a few more modifications, but they will require access to the MySQL database and that may render the script useless to some users.
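The script prints a git fast-import stream on stdout, so (as its usage message says) you pipe it straight into `git-fast-import`, e.g. something like `./wordpress-to-ikiwiki.pl wordpress-export.xml posts | git-fast-import` (the script name here is made up). Each published post becomes one commit that adds a `.mdwn` page under the subdir you pass; the stream looks roughly like this (values invented, byte counts elided):

    commit refs/heads/master
    committer Your Name <you@example.com> 1199145600 +0000
    data <length of commit message>
    Example Post

    from WordPress [http://example.com/2008/01/example-post/]
    M 644 inline posts/example-post.mdwn
    data <length of page content>
    \[[!meta title="Example Post"]]
    ...converted post body, \[[!tag ...]] lines, etc.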
use 5.014;    # say, state, and the /r substitution flag are all used below
use XML::Simple;
use LWP::UserAgent;
use Try::Tiny;
use DateTime::Format::Strptime;
use HTML::WikiConverter;
use Digest::MD5 'md5_hex';

die "usage: $0 import_file subdir [branch] | git-fast-import"
   unless @ARGV == 2 or @ARGV == 3;
chomp(my $name = qx(git config --get user.name));
chomp(my $email = qx(git config --get user.email));

my ($file, $subdir, $branch) = @ARGV;
$branch ||= 'master';    # branch is optional, so default it like the Python version does

for my $x (grep $_->{'wp:status'} eq 'publish', @{XMLin($file)->{channel}{item}}) {
   state $date_parser = DateTime::Format::Strptime->new(
   my $stub = $x->{link} =~ m<([^/]+)/$>
      ? $1
      : lc($x->{title} =~ s/\W/-/gr =~ s/-$//r);
   my $guid = $x->{guid}{content} || $x->{link};
   utf8::encode($x->{title});
   my $msg = qq($x->{title}\n\nfrom WordPress [$guid]);
   my $timestamp = $date_parser
      ->parse_datetime($x->{'wp:post_date_gmt'})
      ->epoch;
   my $c = $x->{category};
   $c = [$c] if ref $c && ref $c ne 'ARRAY';

      sprintf(qq([[!meta title="%s"]]\n), $x->{title} =~ s/"/\\"/gr) .
      convert_content($x->{'content:encoded'}) . "\n\n" .
      map '[[!tag ' . s/ /-/r . ']]',
      grep $_ ne 'uncategorized',
   $events{$timestamp} = join "\n",
      "commit refs/heads/$branch",
      "committer $name <$email> $timestamp +0000",
      "M 644 inline $subdir/$stub.mdwn",

   get_comments($x->{link}, "$subdir/$stub")
      if $x->{'wp:post_type'} eq 'post'
   state $ua = LWP::UserAgent->new;

   my $content = $ua->get("$url/feed")->decoded_content;

   try { XMLin($content, ForceArray => ['item']) }

   for my $x (@{$decoded->{channel}{item}}) {
      my $date = $x->{pubDate};
      state $date_parser = DateTime::Format::Strptime->new(
         pattern => '%d %m %Y %T',

      my $datetime = $date_parser
         ->parse_datetime($date);

      my $timestamp = $datetime->epoch;
      my $formatted_date = "$timestamp";

      my $msg = 'Added a comment';
      my $content = convert_content($x->{'content:encoded'});
      utf8::encode($x->{'dc:creator'});
      $events{$timestamp} = join "\n",
         "commit refs/heads/$branch",
         # still need to get email address
         "committer $x->{'dc:creator'} <$x->{'dc:creator'}> $timestamp +0000",
         "M 644 inline " . unique_comment_location($dir, $content),

[[!comment format=mdwn
username="$x->{'dc:creator'}"
date="$formatted_date"

say $events{$_} for sort keys %events;
sub convert_content {
   state $converter = HTML::WikiConverter->new(
      dialect => 'Markdown',
      link_style => 'inline',
      unordered_list_style => 'dash',
      image_style => 'inline',
      image_tag_fallback => 0,
   # I know, I know, you can't parse XML with regular expressions. Go find a real
   # parser and send me a patch.

   my $start_code = qr(<pre[^>]*>);
   # the (?:) is a no-op, but it keeps ikiwiki from breaking my script
   my $end_code = qr(</p(?:)re>);
   $body =~ s(&#(?:8217|039);)(')g;
   $body =~ s(&(?:quot|#822[01]);)(")g;
   $body =~ s(&lt;)(<)g;
   $body =~ s(&gt;)(>)g;
   $body =~ s(&amp;)(&)g;
   $body =~ s(&#8230;)(...)g;
   $body =~ s(&#821[12];)(-)g;
   $body =~ s(&#8216;)(')g;
   $body =~ s(&#8242;)(')g;
   $body =~ s(&#8734;)(∞)g;
   $body =~ s(&#160;)()g;
   $body =~ s(<code[^>]*>)(<p(?:)re>)g;
   $body =~ s(</c(?:)ode>)(</p(?:)re>)g;
      map {; split qr[(?=<p(?:)re>)] }
      map {; split qr[</p(?:)re>\K] }

   for my $t (@tokens) {
      ($in_code && $t !~ $end_code) ||
      ($t =~ $start_code && $t =~ $end_code)
      } elsif ($t =~ $start_code) {
      } elsif ($t =~ $end_code) {
         die "$t !!! '$1'" if $t =~ m/&([^;\s]+);/ && $1 !~ /[lg]t/;

   $converter->html2wiki(join "\n\n", @new_tokens)
sub unique_comment_location {
   my ($dir, $content) = @_;

   utf8::encode($content);
   my $md5 = md5_hex($content);

      $location = "$dir/comment_${i}_$md5._comment";
   } while -e $location;
I modified the script a bit so categories and tags would actually show up in the output file.
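For example, a post filed under a "Health" category or tag (as in the sample XML quoted in the comments below) ends up with a line like this at the bottom of its generated page:

    \[[!taglink Health]]

Assuming `tagbase` is set to `tags`, as the comments in the loop below assume, ikiwiki then turns that into a link to the `tags/Health` page.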
#!/usr/bin/env python

"""
Wordpress-to-Ikiwiki import tool

Copyright (C) 2007 Chris Lamb <chris@chris-lamb.co.uk>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

Usage: run this script with --help as an argument.
I added some extra bits to include the \[[!tag foo]] stuff in the post,
as it wasn't there before at all. I'll diff the versions out so you can see
"""

import os, re, sys, time

from BeautifulSoup import BeautifulSoup

import codecs, htmlentitydefs
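# The 'html_replace' error handler registered below turns any character that
# cannot be encoded as ASCII into its named HTML entity (via codepoint2name),
# which is what lets content.encode('ascii', 'html_replace') further down work.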
codecs.register_error('html_replace', lambda x: (''.join([u'&%s;' \
    % htmlentitydefs.codepoint2name[ord(c)] for c in x.object[x.start:x.end]]), x.end))
def main(name, email, subdir, branch='master'):
    soup = BeautifulSoup(sys.stdin.read())

    # Regular expression to match stub in URL.
    stub_pattern = re.compile(r'.*\/(.+)\/$')

    for x in soup.findAll('item'):
        if x.find('wp:status').string != 'publish': continue

        match = stub_pattern.match(x.guid.string)
        if match:
            stub = match.groups()[0]
        else:
            # Fall back to our own stubs
            stub = re.sub(r'[^a-zA-Z0-9_]', '-', x.title.string).lower()

        commit_msg = """Importing WordPress post "%s" [%s]""" % (x.title.string, x.guid.string)
        timestamp = time.mktime(time.strptime(x.find('wp:post_date_gmt').string, "%Y-%m-%d %H:%M:%S"))

        content = '\[[!meta title="%s"]]\n\n' % (x.title.string.replace('"', r'\"'))
        content += x.find('content:encoded').string.replace('\r\n', '\n')
        # categories = x.findAll('category')
        # categories = x.findAll({'category':True}, attrs={'domain':re.compile(('category|tag'))})
        # categories = x.findAll({'category':True}, domain=["category", "tag"])
        # categories = x.findAll({'category':True}, nicename=True)
        """
        We do it differently here because we have duplicates otherwise.

        <category><![CDATA[Health]]></category>
        <category domain="category" nicename="health"><![CDATA[Health]]></category>

        If we do what the original did, we end up with all tags and cats doubled.
        Therefore we only pick out nicename="foo".  Our 'True' below is our 'foo'.
        I'd much rather have the value of 'nicename', and tried, but my
        python skillz are extremely limited....
        """
        categories = x.findAll('category', nicename=True)

        for cat in categories:
            # remove 'tags/' because we have a 'tagbase' set.
            # your choice: 'tag', or 'taglink'
            # content += "\n\[[!tag %s]]" % (cat.string.replace(' ', '-'))
            content += "\n\[[!taglink %s]]" % (cat.string.replace(' ', '-'))
            # print >>sys.stderr, cat.string.replace(' ', '-')
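            # (Sketch only, not in the original script: BeautifulSoup also exposes the
            # attribute value itself, so the nicename slug could be used directly
            # instead of munging cat.string, e.g.
            # content += "\n\[[!taglink %s]]" % cat['nicename']  -- untested.)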
        # moved this thing down
        data = content.encode('ascii', 'html_replace')

        print "commit refs/heads/%s" % branch
        print "committer %s <%s> %d +0000" % (name, email, timestamp)
        print "data %d" % len(commit_msg)

        print "M 644 inline %s" % os.path.join(subdir, "%s.mdwn" % stub)
        print "data %d" % len(data)

if __name__ == "__main__":
    if len(sys.argv) not in (4, 5):
        print >>sys.stderr, "%s: usage: %s name email subdir [branch] < wordpress-export.xml | git-fast-import " % (sys.argv[0], sys.argv[0])
I have another version of the script, which uses the `timestamp` computed in the script and inserts it into the post as a \[[!meta date="foodate"]] directive. I'm posting it here just in case I happen to be doing something to the httpd.

(Hopefully I've escaped everything properly; if I missed something, check the source.)
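Concretely, each imported post then starts with a pair of directives like these (the date string is whatever `datetime.fromtimestamp` prints; the values here are invented):

    \[[!meta title="Example Post"]]
    \[[!meta date="2008-01-01 00:00:00"]]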
#!/usr/bin/env python

"""
Wordpress-to-Ikiwiki import tool

Copyright (C) 2007 Chris Lamb <chris@chris-lamb.co.uk>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

Usage: run this script with --help as an argument.
I added some extra bits to include the \[[!tag foo]] stuff in the post,
as it wasn't there before at all. I'll diff the versions out so you can see
"""

from datetime import datetime

import os, re, sys, time

from BeautifulSoup import BeautifulSoup

import codecs, htmlentitydefs

codecs.register_error('html_replace', lambda x: (''.join([u'&%s;' \
    % htmlentitydefs.codepoint2name[ord(c)] for c in x.object[x.start:x.end]]), x.end))
def main(name, email, subdir, branch='master'):
    soup = BeautifulSoup(sys.stdin.read())

    # Regular expression to match stub in URL.
    stub_pattern = re.compile(r'.*\/(.+)\/$')

    for x in soup.findAll('item'):
        if x.find('wp:status').string != 'publish': continue

        match = stub_pattern.match(x.guid.string)
        if match:
            stub = match.groups()[0]
        else:
            # Fall back to our own stubs
            stub = re.sub(r'[^a-zA-Z0-9_]', '-', x.title.string).lower()

        commit_msg = """Importing WordPress post "%s" [%s]""" % (x.title.string, x.guid.string)
        timestamp = time.mktime(time.strptime(x.find('wp:post_date_gmt').string, "%Y-%m-%d %H:%M:%S"))

        content = '\[[!meta title="%s"]]\n' % (x.title.string.replace('"', r'\"'))
        content += "\[[!meta date=\"%s\"]]\n" % datetime.fromtimestamp(timestamp)
        content += x.find('content:encoded').string.replace('\r\n', '\n')
        """
        We do it differently here because we have duplicates otherwise.

        <category><![CDATA[Health]]></category>
        <category domain="category" nicename="health"><![CDATA[Health]]></category>

        If we do what the original did, we end up with all tags and cats doubled.
        Therefore we only pick out nicename="foo".  Our 'True' below is our 'foo'.
        I'd much rather have the value of 'nicename', and tried, but my
        python skillz are extremely limited....
        """
        categories = x.findAll('category', nicename=True)

        for cat in categories:
            # remove 'tags/' because we have a 'tagbase' set.
            # your choice: 'tag', or 'taglink'
            # content += "\n\[[!tag %s]]" % (cat.string.replace(' ', '-'))
            content += "\n\[[!taglink %s]]" % (cat.string.replace(' ', '-'))
            # this is just debugging, and for fun
            # print >>sys.stderr, cat.string.replace(' ', '-')
        # moved this thing down
        data = content.encode('ascii', 'html_replace')

        print "commit refs/heads/%s" % branch
        print "committer %s <%s> %d +0000" % (name, email, timestamp)
        print "data %d" % len(commit_msg)

        print "M 644 inline %s" % os.path.join(subdir, "%s.mdwn" % stub)
        print "data %d" % len(data)

if __name__ == "__main__":
    if len(sys.argv) not in (4, 5):
        print >>sys.stderr, "%s: usage: %s name email subdir [branch] < wordpress-export.xml | git-fast-import " % (sys.argv[0], sys.argv[0])