From: Joey Hess Date: Tue, 3 Jun 2008 19:29:54 +0000 (-0400) Subject: search: Converted to use xapian-omega. X-Git-Tag: 2.49~36 X-Git-Url: http://git.vanrenterghem.biz/git.ikiwiki.info.git/commitdiff_plain/8a6a5320edc2c8a2ed357463b61f161d5b295fbf?ds=sidebyside search: Converted to use xapian-omega. Everything is done except for the actual indexing. I plan to do incremental indexing as pages change. --- diff --git a/Bundle/IkiWiki/Extras.pm b/Bundle/IkiWiki/Extras.pm index f09225d49..9289968e7 100644 --- a/Bundle/IkiWiki/Extras.pm +++ b/Bundle/IkiWiki/Extras.pm @@ -16,6 +16,7 @@ perl -MCPAN -e 'install Bundle::IkiWiki::Extras' =head1 CONTENTS +Search::Xapian Authen::Passphrase RPC::XML File::MimeInfo diff --git a/IkiWiki/Plugin/search.pm b/IkiWiki/Plugin/search.pm index 9bf223cf0..e705d018a 100644 --- a/IkiWiki/Plugin/search.pm +++ b/IkiWiki/Plugin/search.pm @@ -1,5 +1,5 @@ #!/usr/bin/perl -# hyperestraier search engine plugin +# xapian-omega search engine plugin package IkiWiki::Plugin::search; use warnings; @@ -7,33 +7,32 @@ use strict; use IkiWiki 2.00; sub import { #{{{ - hook(type => "getopt", id => "hyperestraier", - call => \&getopt); - hook(type => "checkconfig", id => "hyperestraier", - call => \&checkconfig); - hook(type => "pagetemplate", id => "hyperestraier", - call => \&pagetemplate); - hook(type => "delete", id => "hyperestraier", - call => \&delete); - hook(type => "change", id => "hyperestraier", - call => \&change); - hook(type => "cgi", id => "hyperestraier", - call => \&cgi); + hook(type => "checkconfig", id => "search", call => \&checkconfig); + hook(type => "pagetemplate", id => "search", call => \&pagetemplate); + hook(type => "delete", id => "search", call => \&delete); + hook(type => "change", id => "search", call => \&change); + hook(type => "cgi", id => "search", call => \&cgi); } # }}} -sub getopt () { #{{{ - eval q{use Getopt::Long}; - error($@) if $@; - Getopt::Long::Configure('pass_through'); - GetOptions("estseek=s" => \$config{estseek}); -} #}}} - sub checkconfig () { #{{{ foreach my $required (qw(url cgiurl)) { if (! length $config{$required}) { error(sprintf(gettext("Must specify %s when using the search plugin"), $required)); } } + + if (! exists $config{omega_cgi}) { + $config{omega_cgi}="/usr/lib/cgi-bin/omega/omega"; + } + + if (! -e $config{wikistatedir}."/xapian" || $config{rebuild}) { + writefile("omega.conf", $config{wikistatedir}."/xapian", + "database_dir .\n". + "template_dir ./templates\n"); + writefile("query", $config{wikistatedir}."/xapian/templates", + IkiWiki::misctemplate(gettext("search"), + readfile(IkiWiki::template_file("searchquery.tmpl")))); + } } #}}} my $form; @@ -55,93 +54,22 @@ sub pagetemplate (@) { #{{{ } #}}} sub delete (@) { #{{{ - debug(gettext("cleaning hyperestraier search index")); - estcmd("purge -cl"); - estcfg(); + debug(gettext("cleaning xapian search index")); } #}}} sub change (@) { #{{{ - debug(gettext("updating hyperestraier search index")); - estcmd("gather -cm -bc -cl -sd", - map { - map { - Encode::encode_utf8($config{destdir}."/".$_) - } @{$renderedfiles{pagename($_)}}; - } @_ - ); - estcfg(); + debug(gettext("updating xapian search index")); } #}}} sub cgi ($) { #{{{ my $cgi=shift; - if (defined $cgi->param('phrase') || defined $cgi->param("navi")) { + if (defined $cgi->param('P')) { # only works for GET requests - chdir("$config{wikistatedir}/hyperestraier") || error("chdir: $!"); - exec("./".IkiWiki::basename($config{cgiurl})) || error("estseek.cgi failed"); - } -} #}}} - -my $configured=0; -sub estcfg () { #{{{ - return if $configured; - $configured=1; - - my $estdir="$config{wikistatedir}/hyperestraier"; - my $cgi=IkiWiki::basename($config{cgiurl}); - $cgi=~s/\..*$//; - - my $newfile="$estdir/$cgi.tmpl.new"; - my $cleanup = sub { unlink($newfile) }; - open(TEMPLATE, ">:utf8", $newfile) || error("open $newfile: $!", $cleanup); - print TEMPLATE IkiWiki::misctemplate("search", - "\n\n\n\n\n\n", - forcebaseurl => IkiWiki::dirname($config{cgiurl})."/") || - error("write $newfile: $!", $cleanup); - close TEMPLATE || error("save $newfile: $!", $cleanup); - rename($newfile, "$estdir/$cgi.tmpl") || - error("rename $newfile: $!", $cleanup); - - $newfile="$estdir/$cgi.conf"; - open(TEMPLATE, ">$newfile") || error("open $newfile: $!", $cleanup); - my $template=template("estseek.conf"); - eval q{use Cwd 'abs_path'}; - $template->param( - index => $estdir, - tmplfile => "$estdir/$cgi.tmpl", - destdir => abs_path($config{destdir}), - url => $config{url}, - ); - print TEMPLATE $template->output || error("write $newfile: $!", $cleanup); - close TEMPLATE || error("save $newfile: $!", $cleanup); - rename($newfile, "$estdir/$cgi.conf") || - error("rename $newfile: $!", $cleanup); - - $cgi="$estdir/".IkiWiki::basename($config{cgiurl}); - unlink($cgi); - my $estseek = defined $config{estseek} ? $config{estseek} : '/usr/lib/estraier/estseek.cgi'; - symlink($estseek, $cgi) || error("symlink $estseek $cgi: $!"); -} # }}} - -sub estcmd ($;@) { #{{{ - my @params=split(' ', shift); - push @params, "-cl", "$config{wikistatedir}/hyperestraier"; - if (@_) { - push @params, "-"; - } - - my $pid=open(CHILD, "|-"); - if ($pid) { - # parent - foreach (@_) { - print CHILD "$_\n"; - } - close(CHILD) || print STDERR "estcmd @params exited nonzero: $?\n"; - } - else { - # child - open(STDOUT, "/dev/null"); # shut it up (closing won't work) - exec("estcmd", @params) || error("can't run estcmd"); + chdir("$config{wikistatedir}/xapian") || error("chdir: $!"); + $ENV{OMEGA_CONFIG_FILE}="./omega.conf"; + $ENV{CGIURL}=$config{cgiurl}, + exec($config{omega_cgi}) || error("$config{omega_cgi} failed: $!"); } } #}}} diff --git a/debian/changelog b/debian/changelog index 02796394b..d80f78062 100644 --- a/debian/changelog +++ b/debian/changelog @@ -4,6 +4,7 @@ ikiwiki (2.49) UNRELEASED; urgency=low * ikiwiki-mass-rebuild: Don't trust $! when setting $) * inline: The optimisation in 2.41 broke nested inlines. Detect those and avoid overoptimising. + * search: Converted to use xapian-omega. -- Joey Hess Fri, 30 May 2008 19:08:54 -0400 diff --git a/debian/control b/debian/control index b71cbed6f..af281a74e 100644 --- a/debian/control +++ b/debian/control @@ -14,7 +14,7 @@ Package: ikiwiki Architecture: all Depends: ${perl:Depends}, markdown | libtext-markdown-perl, libhtml-scrubber-perl, libhtml-template-perl, libhtml-parser-perl, liburi-perl Recommends: gcc | c-compiler, libc6-dev | libc-dev, subversion | git-core (>= 1:1.5.0) | tla | bzr (>= 0.91) | mercurial | monotone (>= 0.38), libxml-simple-perl, libnet-openid-consumer-perl, liblwpx-paranoidagent-perl, libtimedate-perl, libcgi-formbuilder-perl (>= 3.05), libcgi-session-perl (>= 4.14-1), libmail-sendmail-perl, libauthen-passphrase-perl -Suggests: viewvc | gitweb | viewcvs, hyperestraier, librpc-xml-perl, libtext-wikiformat-perl, python, python-docutils, polygen, tidy, libxml-feed-perl, libmailtools-perl, perlmagick, libfile-mimeinfo-perl, libcrypt-ssleay-perl, liblocale-gettext-perl (>= 1.05-1), libtext-typography-perl, libtext-csv-perl, libdigest-sha1-perl, graphviz, libnet-amazon-s3-perl +Suggests: viewvc | gitweb | viewcvs, libsearch-xapian-perl, xapian-omega, librpc-xml-perl, libtext-wikiformat-perl, python, python-docutils, polygen, tidy, libxml-feed-perl, libmailtools-perl, perlmagick, libfile-mimeinfo-perl, libcrypt-ssleay-perl, liblocale-gettext-perl (>= 1.05-1), libtext-typography-perl, libtext-csv-perl, libdigest-sha1-perl, graphviz, libnet-amazon-s3-perl Conflicts: ikiwiki-plugin-table Replaces: ikiwiki-plugin-table Provides: ikiwiki-plugin-table diff --git a/doc/features.mdwn b/doc/features.mdwn index 1d762bed4..df963ab4f 100644 --- a/doc/features.mdwn +++ b/doc/features.mdwn @@ -158,8 +158,8 @@ Well, sorta. Rather than implementing YA history browser, it can link to ### Full text search -ikiwiki can use the [[HyperEstraier]] search engine to add powerful -full text search capabilities to your wiki. +ikiwiki can use the xapian search engine to add powerful +full text [[plugins/search]] capabilities to your wiki. ### [[w3mmode]] diff --git a/doc/ikiwiki.setup b/doc/ikiwiki.setup index db806a8c4..03d04176d 100644 --- a/doc/ikiwiki.setup +++ b/doc/ikiwiki.setup @@ -156,9 +156,9 @@ use IkiWiki::Setup::Standard { # base page. #tagbase => "tag", - # For use with the search plugin if your estseek.cgi is located + # For use with the search plugin if the omega cgi is located # somewhere else. - #estseek => "/usr/lib/estraier/estseek.cgi", + #omega_cgi => "/usr/lib/cgi-bin/omega/omega", # For use with the openid plugin, to give an url to a page users # can use to signup for an OpenID. diff --git a/doc/plugins/search.mdwn b/doc/plugins/search.mdwn index 7b32714f4..4c1b50fcd 100644 --- a/doc/plugins/search.mdwn +++ b/doc/plugins/search.mdwn @@ -1,12 +1,17 @@ [[template id=plugin name=search author="[[Joey]]"]] [[tag type/useful]] -This plugin is included in ikiwiki, but is not enabled by default. It adds -full text search to ikiwiki, using the [[HyperEstraier]] engine. +This plugin adds full text search to ikiwiki, using the +[xapian](http://xapian.org/) engine and its +[omega](http://xapian.org/docs/omega/overview.html) frontend. -It's possible to configure HyperEstraier via one of ikiwiki's -[[templates|wikitemplates]], but for most users, no configuration should be -needed aside from enabling the plugin. +Ikiwiki will handle indexing new and changed page contents, using the +[[cpan Search::Xapian]] perl modules. Note that it indexes page contents +before they are preprocessed and converted to html, as this tends to +produce less noisy search results. Also, since it only indexes page +contents, files copied by the [[rawhtml]] plugin will not be indexed, nor +will other types of data files. -This plugin has a configuration option. To change the path to estseek.cgi, -set `--estseek=/path/to/estseek.cgi` +There is one setting you may need to use in the config file. `omega_cgi` +should point to the location of the omega cgi program. The default location +is `/usr/lib/cgi-bin/omega/omega`. diff --git a/doc/plugins/search/discussion.mdwn b/doc/plugins/search/discussion.mdwn index 494d0a38a..6b5714c42 100644 --- a/doc/plugins/search/discussion.mdwn +++ b/doc/plugins/search/discussion.mdwn @@ -42,3 +42,5 @@ Now I did a `rm -rf ~wiki/wiki/.ikiwiki/hyperestraier` and re-ran `--rebuild`ing once more, I'm back to the previous error message. --[[tschwinge]] + +I guess this is fixed now that it uses xapian. :-) --[[Joey]] diff --git a/doc/todo/different_search_engine.mdwn b/doc/todo/different_search_engine.mdwn index 81ca47547..3737fb140 100644 --- a/doc/todo/different_search_engine.mdwn +++ b/doc/todo/different_search_engine.mdwn @@ -1,3 +1,5 @@ +[[done]], using xapian-omega! --[[Joey]] + After using it for a while, my feeling is that [[hyperestraier]], as used in the [[plugins/search]] plugin, is not robust enough for ikiwiki. It doesn't upgrade well, and it has a habit of sig-11 on certain input from time to diff --git a/doc/wikitemplates.mdwn b/doc/wikitemplates.mdwn index f095cb035..b03fc10a1 100644 --- a/doc/wikitemplates.mdwn +++ b/doc/wikitemplates.mdwn @@ -21,15 +21,14 @@ located in /usr/share/ikiwiki/templates by default. * `inlinepage.tmpl` - Used for adding a page inline in a blog page. * `archivepage.tmpl` - Used for listing a page in a blog archive page. -* `estseek.conf` - Not a html template, this is actually a template for - a config file for the [[HyperEstraier]] search engine. If you like you - can read the [[HyperEstraier]] docs and configure it using this. * `blogpost.tmpl` - Used for a form to add a post to a blog (and a rss/atom links) * `feedlink.tmpl` - Used to add rss/atom links if blogpost.tmpl is not used. * `aggregatepost.tmpl` - Used by the [[plugins/aggregate]] plugin to create a page for a post. * `searchform.tmpl` - Used by the [[plugins/search]] plugin to add a search form to wiki pages. +* `searchquery.tmpl` - This is an omega template, used by the + [[plugins/search]] plugin. The [[plugins/pagetemplate]] plugin can allow individual pages to use a different template than `page.tmpl`. diff --git a/templates/searchform.tmpl b/templates/searchform.tmpl index 7c4fdb026..d49cf22d3 100644 --- a/templates/searchform.tmpl +++ b/templates/searchform.tmpl @@ -1,7 +1,5 @@
- - - +
diff --git a/templates/searchquery.tmpl b/templates/searchquery.tmpl new file mode 100644 index 000000000..dd12d2970 --- /dev/null +++ b/templates/searchquery.tmpl @@ -0,0 +1,117 @@ +$set{thousand,$.}$set{decimal,.}$setmap{BN,,Any Country,uk,England,fr,France} +${ +$def{PREV, +$if{$ne{$topdoc,0},, +} +} + +$def{NEXT, +$if{$ne{$last,$msize},, +} +} + +$def{P,} +$def{PAGE,$if{$gt{$1,9},$if{$gt{$1,99},$P{$1,$div{$1,100}}}$P{$1,$mod{$div{$1,10},10}}}$P{$1,$mod{$1,10}}} + +$def{S,$1} +$def{SPAGE,$if{$gt{$1,9},$if{$gt{$1,99},$S{$1,$div{$1,100}}}$S{$1,$mod{$div{$1,10},10}}}$S{$1,$mod{$1,10}}} +} + +$def{PREV,$if{$ne{$topdoc,0},}} + +$def{PAGE,} + +$def{SPAGE,} + +$def{NEXT,$if{$ne{$last,$msize},}} + +

+ +

+
+ + +
+ +$if{$opt{topterms}, +
+ $map{$topterms,$prettyterm{$_} } +
+
+} +$or{$html{$error}, +$if{$eq{$msize,0}, +$if{$query,No documents match your query, +
Searching $nice{$dbsize} documents +}, +$if{$not{$msizeexact}, + $nice{$add{$topdoc,1}}-$nice{$last} of about $nice{$msize} matches, + $if{$and{$eq{$last,$msize},$eq{$topdoc,0}}, + All $nice{$msize} matches, + $nice{$add{$topdoc,1}}$if{$ne{$add{$topdoc,1},$last},-$nice{$last}} of exactly $nice{$msize} matches} +} +
+
+$list{$map{$queryterms,$list{$html{$uniq{$unstem{$_}}},,/,}: $nice{$freq{$_}}},Term frequencies: ,$. ,} +
Search took $time seconds + +$hitlist{ +} +
+${$percentage%} +
+
+
+
+ +Modified:
$html{$date{$field{modtime},%Y-%m-%d}}

+$if{$field{language},Language: $html{$field{language}}
} +$if{$field{size},Size: $html{$filesize{$field{size}}}
} +
+
$html{$or{$field{caption},$field{title},$field{url},Untitled}}
+$highlight{$field{sample},$terms}$if{$field{sample},...}
+$html{$field{url}}
+ +$percentage% relevant$. matching: +$list{$map{$terms,$html{$prettyterm{$_}}},$. , and }${for lynx:}

+ +
+ +${suppress next, prev, and page links if there's only one page} +$if{$ne{$lastpage,1}, +$set{a,$if{$opt{pagelink_height}, HEIGHT=$opt{pagelink_height}}$if{$opt{pagelink_width}, WIDTH=$opt{pagelink_width}}} + +${1-W ... X-(this)-Y ...} +$set{w,$min{3,$add{$thispage,-1}}} +$set{x,$max{$add{$opt{w},1},$add{$thispage,-3}}} +$set{y,$min{$lastpage,$add{$thispage,8}}} +$PREV +$map{$range{1,$opt{w}},$PAGE{$_}} +$if{$ne{$add{$opt{w},1},$opt{x}},...} +$map{$range{$opt{x},$add{$thispage,-1}},$PAGE{$_}} +$SPAGE{$thispage} +$map{$range{$add{$thispage,1},$opt{y}},$PAGE{$_}} +$if{$ne{$opt{y},$lastpage},...} +$NEXT +} +}} +

+$if{$dbname,} +$if{$ne{$topdoc,0},} +$if{$ne{$hitsperpage,10},} +$if{$fmt,} +$if{$cgi{COLLAPSE},} +$if{$queryterms,} + + +$list{$relevants,} +$if{$cgi{THRESHOLD},} +
+
$html{$version}