]> git.vanrenterghem.biz Git - git.ikiwiki.info.git/blobdiff - IkiWiki/Plugin/htmlscrubber.pm
(no commit message)
[git.ikiwiki.info.git] / IkiWiki / Plugin / htmlscrubber.pm
index 25caa8a506cdf5ab8ea0c9b0d3e9e935f62bd6f8..927792f791f2160232671e1327db2a379a496e9d 100644 (file)
@@ -3,25 +3,20 @@ package IkiWiki::Plugin::htmlscrubber;
 
 use warnings;
 use strict;
 
 use warnings;
 use strict;
-use IkiWiki 2.00;
+use IkiWiki 3.00;
 
 
-sub import { #{{{
-       hook(type => "sanitize", id => "htmlscrubber", call => \&sanitize);
-} # }}}
+# This regexp matches urls that are in a known safe scheme.
+# Feel free to use it from other plugins.
+our $safe_url_regexp;
 
 
-sub sanitize (@) { #{{{
-       my %params=@_;
-       return scrubber()->scrub($params{content});
-} # }}}
+sub import {
+       hook(type => "getsetup", id => "htmlscrubber", call => \&getsetup);
+       hook(type => "sanitize", id => "htmlscrubber", call => \&sanitize);
 
 
-my $_scrubber;
-sub scrubber { #{{{
-       return $_scrubber if defined $_scrubber;
-       
        # Only known uri schemes are allowed to avoid all the ways of
        # embedding javascript.
        # List at http://en.wikipedia.org/wiki/URI_scheme
        # Only known uri schemes are allowed to avoid all the ways of
        # embedding javascript.
        # List at http://en.wikipedia.org/wiki/URI_scheme
-       my $uri_schemes=join("|",
+       my $uri_schemes=join("|", map quotemeta,
                # IANA registered schemes
                "http", "https", "ftp", "mailto", "file", "telnet", "gopher",
                "aaa", "aaas", "acap",  "cap", "cid", "crid", 
                # IANA registered schemes
                "http", "https", "ftp", "mailto", "file", "telnet", "gopher",
                "aaa", "aaas", "acap",  "cap", "cid", "crid", 
@@ -29,21 +24,55 @@ sub scrubber { #{{{
                "ldap", "mid", "news", "nfs", "nntp", "pop", "pres",
                "sip", "sips", "snmp", "tel", "urn", "wais", "xmpp",
                "z39.50r", "z39.50s",
                "ldap", "mid", "news", "nfs", "nntp", "pop", "pres",
                "sip", "sips", "snmp", "tel", "urn", "wais", "xmpp",
                "z39.50r", "z39.50s",
-               # data is a special case. Allow data:text/<image>, but
-               # disallow data:text/javascript and everything else.
-               qr/data:text\/(?:png|gif|jpeg)/,
                # Selected unofficial schemes
                # Selected unofficial schemes
-               "about", "aim", "callto", "cvs", "ed2k", "feed", "fish", "gg",
+               "aim", "callto", "cvs", "ed2k", "feed", "fish", "gg",
                "irc", "ircs", "lastfm", "ldaps", "magnet", "mms",
                "msnim", "notes", "rsync", "secondlife", "skype", "ssh",
                "irc", "ircs", "lastfm", "ldaps", "magnet", "mms",
                "msnim", "notes", "rsync", "secondlife", "skype", "ssh",
-               "sftp", "sms", "steam", "webcal", "ymsgr",
+               "sftp", "smb", "sms", "snews", "webcal", "ymsgr",
        );
        );
-       my $link=qr/^(?:$uri_schemes:|[^:]+$)/i;
+       # data is a special case. Allow a few data:image/ types,
+       # but disallow data:text/javascript and everything else.
+       $safe_url_regexp=qr/^(?:(?:$uri_schemes):|data:image\/(?:png|jpeg|gif)|[^:]+(?:$|[\/\?#]))|^#/i;
+}
+
+sub getsetup () {
+       return
+               plugin => {
+                       safe => 1,
+                       rebuild => undef,
+                       section => "core",
+               },
+               htmlscrubber_skip => {
+                       type => "pagespec",
+                       example => "!*/Discussion",
+                       description => "PageSpec specifying pages not to scrub",
+                       link => "ikiwiki/PageSpec",
+                       safe => 1,
+                       rebuild => undef,
+               },
+}
+
+sub sanitize (@) {
+       my %params=@_;
+
+       if (exists $config{htmlscrubber_skip} &&
+           length $config{htmlscrubber_skip} &&
+           exists $params{destpage} &&
+           pagespec_match($params{destpage}, $config{htmlscrubber_skip})) {
+               return $params{content};
+       }
+
+       return scrubber()->scrub($params{content});
+}
+
+my $_scrubber;
+sub scrubber {
+       return $_scrubber if defined $_scrubber;
 
        eval q{use HTML::Scrubber};
        error($@) if $@;
        # Lists based on http://feedparser.org/docs/html-sanitization.html
 
        eval q{use HTML::Scrubber};
        error($@) if $@;
        # Lists based on http://feedparser.org/docs/html-sanitization.html
-       # With html 5 video and audio tags added.
+       # With html5 tags added.
        $_scrubber = HTML::Scrubber->new(
                allow => [qw{
                        a abbr acronym address area b big blockquote br br/
        $_scrubber = HTML::Scrubber->new(
                allow => [qw{
                        a abbr acronym address area b big blockquote br br/
@@ -53,33 +82,45 @@ sub scrubber { #{{{
                        menu ol optgroup option p p/ pre q s samp select small
                        span strike strong sub sup table tbody td textarea
                        tfoot th thead tr tt u ul var
                        menu ol optgroup option p p/ pre q s samp select small
                        span strike strong sub sup table tbody td textarea
                        tfoot th thead tr tt u ul var
-                       video audio
+
+                       video audio source section nav article aside hgroup
+                       header footer figure figcaption time mark canvas
+                       datalist progress meter ruby rt rp details summary
                }],
                default => [undef, { (
                        map { $_ => 1 } qw{
                                abbr accept accept-charset accesskey
                                align alt axis border cellpadding cellspacing
                }],
                default => [undef, { (
                        map { $_ => 1 } qw{
                                abbr accept accept-charset accesskey
                                align alt axis border cellpadding cellspacing
-                               char charoff charset checked cite class
+                               char charoff charset checked class
                                clear cols colspan color compact coords
                                datetime dir disabled enctype for frame
                                headers height hreflang hspace id ismap
                                clear cols colspan color compact coords
                                datetime dir disabled enctype for frame
                                headers height hreflang hspace id ismap
-                               label lang longdesc maxlength media method
+                               label lang maxlength media method
                                multiple name nohref noshade nowrap prompt
                                readonly rel rev rows rowspan rules scope
                                selected shape size span start summary
                                multiple name nohref noshade nowrap prompt
                                readonly rel rev rows rowspan rules scope
                                selected shape size span start summary
-                               tabindex target title type usemap valign
+                               tabindex target title type valign
                                value vspace width
                                value vspace width
-                               autoplay loopstart loopend end
-                               playcount controls 
+
+                               autofocus autoplay preload loopstart
+                               loopend end playcount controls pubdate
+                               placeholder min max step low high optimum
+                               form required autocomplete novalidate pattern
+                               list formenctype formmethod formnovalidate
+                               formtarget reversed spellcheck open hidden
                        } ),
                        "/" => 1, # emit proper <hr /> XHTML
                        } ),
                        "/" => 1, # emit proper <hr /> XHTML
-                       href => $link,
-                       src => $link,
-                       action => $link,
-                       poster => $link,
+                       href => $safe_url_regexp,
+                       src => $safe_url_regexp,
+                       action => $safe_url_regexp,
+                       formaction => $safe_url_regexp,
+                       cite => $safe_url_regexp,
+                       longdesc => $safe_url_regexp,
+                       poster => $safe_url_regexp,
+                       usemap => $safe_url_regexp,
                }],
        );
        return $_scrubber;
                }],
        );
        return $_scrubber;
-} # }}}
+}
 
 1
 
 1