doc/tips/importing_posts_from_wordpress/ikiwiki-wordpress-import.mdwn

   1 [[!meta title="ikiwiki-wordpress-import"]]
   2
   3 I converted the script to Perl.  The new version gets your name and email automatically from your git config, converts the body of your posts to Markdown, and also imports comments.  More importantly, it works with the latest WordPress, which the Python version does not.  Note that it is still not 100% perfect, and I intend to make a few more modifications, but they will require access to the MySQL database, which may render the script useless to some users.
   4
   5 -----
   6 [[!format perl '''
   7 #!/usr/bin/env perl
   8
   9 use 5.16.1;
  10 use warnings;
  11
  12 use XML::Simple;
  13 use DateTime::Format::Strptime;
  14 use HTML::WikiConverter;
  15 use LWP::UserAgent;
  16 use Try::Tiny;
  17 use Digest::MD5 'md5_hex';
  18
  19 die "usage: $0 import_file subdir [branch] | git-fast-import"
  20    unless @ARGV == 2 or @ARGV == 3;
  21 # The committer identity used for imported posts is read from the git configuration.
  22 chomp(my $name = qx(git config --get user.name));
  23 chomp(my $email = qx(git config --get user.email));
  24 # [branch] is optional: the trailing 'master' is only picked up when it is omitted.
  25 my ($file, $subdir, $branch) = (@ARGV, 'master');
  26 # fast-import commit text keyed by epoch timestamp; filled by the post loop and get_comments.
  27 my %events;
  28
  29 POST: # Build one fast-import commit for every published item in the export file.
  30 for my $x (grep $_->{'wp:status'} eq 'publish', @{XMLin($file)->{channel}{item}}) {
  31    state $date_parser = DateTime::Format::Strptime->new(
  32       pattern => '%F %T',
  33       time_zone => 'UTC',
  34    );
  35    # Page name: last path segment of the permalink, else a slug made from the title.
  36    my $stub = ($x->{link} // '') =~ m<([^/]+)\/$>
  37       ? $1
  38       : lc($x->{title} =~ s/\W/-/gr =~ s/-$//r)
  39    ;
  40
  41    my $guid = $x->{guid}{content} || $x->{link};
  42    utf8::encode($x->{title});
  43    my $msg = qq($x->{title}\n\nfrom WordPress [$guid]);
  44    my $timestamp = $date_parser
  45       ->parse_datetime($x->{'wp:post_date_gmt'})
  46       ->epoch;
  47    # Keep only the category entries that are hashes, since the nicename is read from them.
  48    my $c = $x->{category};
  49    $c = [ grep ref($_), ref($c) eq 'ARRAY' ? @$c : $c ];
  50
  51    my $content =
  52       sprintf(qq([[!meta title="%s"]]\n), $x->{title} =~ s/"/\\"/gr) .
  53       convert_content($x->{'content:encoded'}) . "\n\n" .
  54       join("\n",
  55          map '[[!tag ' . s/ /-/r . ']]',
  56          keys %{
  57             +{
  58                map { $_ => 1 }
  59                grep defined($_) && $_ ne 'uncategorized',
  60                map $_->{nicename},
  61                @$c
  62             }
  63          }
  64       );
  65    # Queue the commit (an existing entry for the same epoch second is replaced).
  66    $events{$timestamp} = join "\n",
  67       "commit refs/heads/$branch",
  68       "committer $name <$email> $timestamp +0000",
  69       'data <<8675309',
  70       $msg,
  71       '8675309',
  72       "M 644 inline $subdir/$stub.mdwn",
  73       'data <<8675309',
  74       $content,
  75       '8675309'
  76    ;
  77    # Only items of type 'post' have their comment feed imported.
  78    get_comments($x->{link}, "$subdir/$stub")
  79       if $x->{'wp:post_type'} eq 'post'
  80 }
  81
  82 sub get_comments {
  83    my ($url, $dir) = @_;
  84    # Queue one commit per comment found in the feed at "$url/feed", filed under $dir.
  85    state $ua = LWP::UserAgent->new;
  86    # A post whose feed cannot be fetched or parsed simply gets no comments.
  87    my $response = $ua->get("$url/feed");
  88    return unless $response->is_success;
  89    my $bail;
  90    my $decoded =
  91       try { XMLin($response->decoded_content, ForceArray => ['item']) }
  92       catch { $bail = 1 };
  93
  94    return if $bail;
  95
  96    COMMENT:
  97    for my $x (@{$decoded->{channel}{item}}) {
  98       my $date = $x->{pubDate};
  99       $date =~ s/^\S+\s//;   # drop the first token (the weekday, assuming an RFC 822 date)
 100       $date =~ s/\s\S+$//;   # drop the last token (presumably the time zone)
 101
 102       # crude: swap the month abbreviation for its number so '%d %m %Y %T' can parse it
 103       $date =~ s/Jan/01/;
 104       $date =~ s/Feb/02/;
 105       $date =~ s/Mar/03/;
 106       $date =~ s/Apr/04/;
 107       $date =~ s/May/05/;
 108       $date =~ s/Jun/06/;
 109       $date =~ s/Jul/07/;
 110       $date =~ s/Aug/08/;
 111       $date =~ s/Sep/09/;
 112       $date =~ s/Oct/10/;
 113       $date =~ s/Nov/11/;
 114       $date =~ s/Dec/12/;
 115
 116       state $date_parser = DateTime::Format::Strptime->new(
 117          pattern => '%d %m %Y %T',
 118          time_zone => 'UTC',
 119       );
 120
 121       my $datetime = $date_parser
 122          ->parse_datetime($date);
 123
 124       my $timestamp = $datetime->epoch;
 125       my $formatted_date = "$timestamp";
 126
 127       my $msg = 'Added a comment';
 128       my $content = convert_content($x->{'content:encoded'});
 129       utf8::encode($x->{'dc:creator'});
 130       # Queue the commit (an existing entry for the same epoch second is replaced).
 131       $events{$timestamp} = join "\n",
 132          "commit refs/heads/$branch",
 133          # still need to get email address
 134          "committer $x->{'dc:creator'} <$x->{'dc:creator'}> $timestamp +0000",
 135          'data <<8675309',
 136          $msg,
 137          '8675309',
 138          "M 644 inline " . unique_comment_location($dir, $content),
 139          'data <<8675309',
 140
 141       <<"COMMENT",
 142 [[!comment format=mdwn
 143  username="$x->{'dc:creator'}"
 144  date="$formatted_date"
 145  content="""
 146 $content
 147 """]]
 148 COMMENT
 149       '8675309'
 150       ;
 151    }
 152 }
 153
 154 say $events{$_} for sort { $a <=> $b } keys %events;   # oldest first: epoch keys compared as numbers, not strings
 155
 156 sub convert_content {
 157    my $body = shift;
 158    # Turn the HTML in $body into Markdown with HTML::WikiConverter and return the result.
 159    utf8::encode($body);
 160
 161    state $converter = HTML::WikiConverter->new(
 162       dialect              => 'Markdown',
 163       link_style           => 'inline',
 164       unordered_list_style => 'dash',
 165       image_style          => 'inline',
 166       image_tag_fallback   => 0,
 167    );
 168
 169    # I know I know you can't parse XML with regular expressions.  Go find a real
 170    # parser and send me a patch
 171    my $in_code = 0;
 172
 173    my $start_code = qr(<pre[^>]*>);
 174    # (?:) is a no op but keeps ikiwiki from breaking my script
 175    my $end_code = qr(</p(?:)re>);
 176    # Decode the entities seen in exports, then rewrite code tags as pre tags.
 177    $body =~ s(&#(?:8217|039);)(')g;
 178    $body =~ s(&(?:quot|#822[01]);)(")g;
 179    $body =~ s(&lt;)(<)g;
 180    $body =~ s(&gt;)(>)g;
 181    $body =~ s(&amp;)(&)g;
 182    $body =~ s(&#8230;)(...)g;
 183    $body =~ s(&#821[12];)(-)g;
 184    $body =~ s(&#8216;)(')g;
 185    $body =~ s(&#8242;)(')g;
 186    $body =~ s(&infin;)(∞)g;
 187    $body =~ s(&nbsp;)()g;
 188    $body =~ s(<code[^>]*>)('<p' . 're>')ge;   # /e + concatenation: "(?:)" is literal text in a replacement
 189    $body =~ s(</c(?:)ode>)('</p' . 're>')ge;  # same trick, still without spelling the closing tag out
 190    # Split into paragraphs, then again just before an opening and just after a closing pre tag.
 191    my @tokens =
 192       map {; split qr[(?=<p(?:)re>)] }
 193       map {; split qr[</p(?:)re>\K] }
 194       split /\n\n/,
 195       $body;
 196
 197    my @new_tokens;
 198    for my $t (@tokens) {
 199       if (
 200          ($in_code && $t !~ $end_code) ||
 201          ($t =~ $start_code && $t =~ $end_code)
 202       ) {
 203          # inside a pre block, or a complete pre block: pass through untouched
 204       } elsif ($t =~ $start_code) {
 205          $in_code = 1;
 206       } elsif ($t =~ $end_code) {
 207          $in_code = 0;
 208       } else {
 209          die "$t !!! '$1'" if $t =~ m/&([^;\s]+);/ && $1 !~ /[lg]t/;   # leftover entity (names containing lt/gt pass)
 210          # plain paragraph: wrap it in explicit paragraph markup for the converter
 211          $t = "<p>$t</p>"
 212       }
 213       push @new_tokens, $t
 214    }
 215
 216    $converter->html2wiki(join "\n\n", @new_tokens)
 217 }
 218
 219 sub unique_comment_location {
 220    my ($dir, $content) = @_;
 221    # The file name embeds the MD5 of the UTF-8 encoded comment text.
 222    utf8::encode($content);
 223    my $digest = md5_hex($content);
 224    # Count up from 1 until the path does not exist on the local filesystem.
 225    my $n = 1;
 226    my $candidate = "$dir/comment_${n}_$digest._comment";
 227    while (-e $candidate) {
 228       $n++;
 229       $candidate = "$dir/comment_${n}_$digest._comment";
 230    }
 231
 232    return $candidate
 233 }
 234
 235 ''']]
 236 -----
 237
 238 I modified the script a bit so categories and tags would actually show up in the output file.
 239
 240 -----
 241 [[!format python '''
 242 #!/usr/bin/env python
 243
 244 """
 245     Purpose:
 246     Wordpress-to-Ikiwiki import tool
 247
 248     Copyright:
 249     Copyright (C) 2007  Chris Lamb <chris@chris-lamb.co.uk>
 250
 251     This program is free software: you can redistribute it and/or modify
 252     it under the terms of the GNU General Public License as published by
 253     the Free Software Foundation, either version 3 of the License, or
 254     (at your option) any later version.
 255
 256     This program is distributed in the hope that it will be useful,
 257     but WITHOUT ANY WARRANTY; without even the implied warranty of
 258     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 259     GNU General Public License for more details.
 260
 261     You should have received a copy of the GNU General Public License
 262     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 263
 264     Usage: run --help as an argument with this script.
 265
 266     Notes:
 267     I added some extra bits to include the \[[!tag foo]] stuff in the post,
 268     as it wasn't before, at all. I'll diff the versions out so you can see
 269     the mess I made :).
 270
 271 """
 272
 273 import os, sys
 274 import time
 275 import re
 276
 277 from BeautifulSoup import BeautifulSoup
 278
 279 import codecs, htmlentitydefs
 280
 281 codecs.register_error('html_replace', lambda x: (''.join([u'&%s;' \
 282     % htmlentitydefs.codepoint2name[ord(c)] for c in x.object[x.start:x.end]]), x.end))
 283
 284 def main(name, email, subdir, branch='master'):
 285     soup = BeautifulSoup(sys.stdin.read())
 286
 287     # Regular expression to match stub in URL.
 288     stub_pattern = re.compile(r'.*\/(.+)\/$')
 289
 290     for x in soup.findAll('item'):
 291         # Ignore draft posts
 292         if x.find('wp:status').string != 'publish': continue
 293
 294         match = stub_pattern.match(x.guid.string)
 295         if match:
 296             stub = match.groups()[0]
 297         else:
 298             # Fall back to our own stubs
 299             stub = re.sub(r'[^a-zA-Z0-9_]', '-', x.title.string).lower()
 300
 301         commit_msg = """Importing WordPress post "%s" [%s]""" % (x.title.string, x.guid.string)
 302         timestamp = time.mktime(time.strptime(x.find('wp:post_date_gmt').string, "%Y-%m-%d %H:%M:%S"))
 303
 304         content = '\[[!meta title="%s"]]\n\n' % (x.title.string.replace('"', r'\"'))
 305         content += x.find('content:encoded').string.replace('\r\n', '\n')
 306
 307         # categories = x.findAll('category')
 308         # categories = x.findAll({'category':True}, attrs={'domain':re.compile(('category|tag'))})
 309         # categories = x.findAll({'category':True}, domain=["category", "tag"])
 310         # categories = x.findAll({'category':True}, nicename=True)
 311         """
 312         We do it differently here because we have duplicates otherwise.
 313         Take a look:
 314         &lt;category&gt;&lt;![CDATA[Health]]&gt;&lt;/category&gt;
 315         &lt;category domain="category" nicename="health"&gt;&lt;![CDATA[Health]]&gt;&lt;/category&gt;
 316
 317         If we do the what original did, we end up with all tags and cats doubled.
 318         Therefore we only pick out nicename="foo". Our 'True' below is our 'foo'.
 319         I'd much rather have the value of 'nicename', and tried, but my
 320         python skillz are extremely limited....
 321         """
 322         categories = x.findAll('category', nicename=True)
 323         if categories:
 324             content += "\n"
 325             for cat in categories:
 326                 # remove 'tags/' because we have a 'tagbase' set.
 327                 # your choice: 'tag', or 'taglink'
 328                 # content += "\n\[[!tag %s]]" % (cat.string.replace(' ', '-'))
 329                 content += "\n\[[!taglink %s]]" % (cat.string.replace(' ', '-'))
 330                 # print >>sys.stderr, cat.string.replace(' ', '-')
 331
 332         # moved this thing down
 333         data = content.encode('ascii', 'html_replace')
 334         print "commit refs/heads/%s" % branch
 335         print "committer %s &lt;%s&gt; %d +0000" % (name, email, timestamp)
 336         print "data %d" % len(commit_msg)
 337         print commit_msg
 338         print "M 644 inline %s" % os.path.join(subdir, "%s.mdwn" % stub)
 339         print "data %d" % len(data)
 340         print data
 341
 342 if __name__ == "__main__":
 343     if len(sys.argv) not in (4, 5):
 344         print >>sys.stderr, "%s: usage: %s name email subdir [branch] < wordpress-export.xml | git-fast-import " % (sys.argv[0], sys.argv[0])
 345     else:
 346         main(*sys.argv[1:])
 347
 348 ''']]
 349 -----
 350
 351 I have another version of the script, which takes the `timestamp` computed for each post and inserts it as a \[[!meta date="foodate"]] directive. I'm posting it here just in case I happen to be doing something to the httpd.
 352
 353 (Hopefully I've escaped everything properly; if I missed something, check the source.)
 354
 355 -----
 356 [[!format python '''
 357 #!/usr/bin/env python
 358
 359 """
 360     Purpose:
 361     Wordpress-to-Ikiwiki import tool
 362
 363     Copyright:
 364     Copyright (C) 2007  Chris Lamb <chris@chris-lamb.co.uk>
 365
 366     This program is free software: you can redistribute it and/or modify
 367     it under the terms of the GNU General Public License as published by
 368     the Free Software Foundation, either version 3 of the License, or
 369     (at your option) any later version.
 370
 371     This program is distributed in the hope that it will be useful,
 372     but WITHOUT ANY WARRANTY; without even the implied warranty of
 373     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 374     GNU General Public License for more details.
 375
 376     You should have received a copy of the GNU General Public License
 377     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 378
 379     Usage: run --help as an argument with this script.
 380
 381     Notes:
 382     I added some extra bits to include the \[[!tag foo]] stuff in the post,
 383     as it wasn't before, at all. I'll diff the versions out so you can see
 384     the mess I made :).
 385
 386 """
 387
 388 import os, sys
 389 import time
 390 import re
 391
 392 from datetime import datetime
 393 from BeautifulSoup import BeautifulSoup
 394
 395 import codecs, htmlentitydefs
 396
 397 codecs.register_error('html_replace', lambda x: (''.join([u'&%s;' \
 398     % htmlentitydefs.codepoint2name[ord(c)] for c in x.object[x.start:x.end]]), x.end))
 399
 400 def main(name, email, subdir, branch='master'):
 401     soup = BeautifulSoup(sys.stdin.read())
 402
 403     # Regular expression to match stub in URL.
 404     stub_pattern = re.compile(r'.*\/(.+)\/$')
 405
 406     for x in soup.findAll('item'):
 407         # Ignore draft posts
 408         if x.find('wp:status').string != 'publish': continue
 409
 410         match = stub_pattern.match(x.guid.string)
 411         if match:
 412             stub = match.groups()[0]
 413         else:
 414             # Fall back to our own stubs
 415             stub = re.sub(r'[^a-zA-Z0-9_]', '-', x.title.string).lower()
 416
 417         commit_msg = """Importing WordPress post "%s" [%s]""" % (x.title.string, x.guid.string)
 418         timestamp = time.mktime(time.strptime(x.find('wp:post_date_gmt').string, "%Y-%m-%d %H:%M:%S"))
 419         content = '\[[!meta title="%s"]]\n' % (x.title.string.replace('"', r'\"'))
 420         content += "\[[!meta date=\"%s\"]]\n" % datetime.fromtimestamp(timestamp)
 421         content += x.find('content:encoded').string.replace('\r\n', '\n')
 422
 423         """
 424         We do it differently here because we have duplicates otherwise.
 425         Take a look:
 426         &lt;category&gt;&lt;![CDATA[Health]]&gt;&lt;/category&gt;
 427         &lt;category domain="category" nicename="health"&gt;&lt;![CDATA[Health]]&gt;&lt;/category&gt;
 428
 429         If we do the what original did, we end up with all tags and cats doubled.
 430         Therefore we only pick out nicename="foo". Our 'True' below is our 'foo'.
 431         I'd much rather have the value of 'nicename', and tried, but my
 432         python skillz are extremely limited....
 433         """
 434         categories = x.findAll('category', nicename=True)
 435         if categories:
 436             content += "\n"
 437             for cat in categories:
 438                 # remove 'tags/' because we have a 'tagbase' set.
 439                 # your choice: 'tag', or 'taglink'
 440                 # content += "\n\[[!tag %s]]" % (cat.string.replace(' ', '-'))
 441                 content += "\n\[[!taglink %s]]" % (cat.string.replace(' ', '-'))
 442                 # this is just debugging, and for fun
 443                 # print >>sys.stderr, cat.string.replace(' ', '-')
 444
 445         # moved this thing down
 446         data = content.encode('ascii', 'html_replace')
 447         print "commit refs/heads/%s" % branch
 448         print "committer %s &lt;%s&gt; %d +0000" % (name, email, timestamp)
 449         print "data %d" % len(commit_msg)
 450         print commit_msg
 451         print "M 644 inline %s" % os.path.join(subdir, "%s.mdwn" % stub)
 452         print "data %d" % len(data)
 453         print data
 454
 455 if __name__ == "__main__":
 456     if len(sys.argv) not in (4, 5):
 457         print >>sys.stderr, "%s: usage: %s name email subdir [branch] < wordpress-export.xml | git-fast-import " % (sys.argv[0], sys.argv[0])
 458     else:
 459         main(*sys.argv[1:])
 460
 461 ''']]
 462 -----
 463
 464
 465 [[!tag wordpress]]
 466 [[!tag python]]
 467 [[!tag conversion]]
 468 [[!tag ikiwiki]]