]> git.vanrenterghem.biz Git - git.ikiwiki.info.git/blob - IkiWiki/Plugin/htmlscrubber.pm
fix backport
[git.ikiwiki.info.git] / IkiWiki / Plugin / htmlscrubber.pm
1 #!/usr/bin/perl
2 package IkiWiki::Plugin::htmlscrubber;
4 use warnings;
5 use strict;
6 use IkiWiki;
# Called when the plugin is loaded: registers sanitize() with hook()
# as a "sanitize" hook under the id "htmlscrubber".
sub import { #{{{
        # Kept as an ordered list so hook() receives the pairs in the
        # order written here.
        my @registration=(
                type => "sanitize",
                id   => "htmlscrubber",
                call => \&sanitize,
        );
        hook(@registration);
} # }}}
# Sanitize hook. Takes named parameters and returns the value of the
# "content" parameter after it has been passed through the scrub()
# method of the object returned by scrubber().
sub sanitize (@) { #{{{
        my %args=@_;
        my $html=$args{content};
        my $cleaner=scrubber();
        return $cleaner->scrub($html);
} # }}}
# Cached scrubber object; created by the first call to scrubber().
my $_scrubber;

# Returns the HTML::Scrubber object used to clean page content.
# The object is built on the first call and reused on later calls.
# Calls error() with the eval failure message if HTML::Scrubber
# cannot be loaded.
sub scrubber { #{{{
        return $_scrubber if defined $_scrubber;

        # Only known uri schemes are allowed to avoid all the ways of
        # embedding javascript.
        # List at http://en.wikipedia.org/wiki/URI_scheme
        my @scheme_names=(
                # IANA registered schemes
                "http", "https", "ftp", "mailto", "file", "telnet", "gopher",
                "aaa", "aaas", "acap",  "cap", "cid", "crid",
                "dav", "dict", "dns", "fax", "go", "h323", "im", "imap",
                "ldap", "mid", "news", "nfs", "nntp", "pop", "pres",
                "sip", "sips", "snmp", "tel", "urn", "wais", "xmpp",
                "z39.50r", "z39.50s",
                # Selected unofficial schemes
                "about", "aim", "callto", "cvs", "ed2k", "feed", "fish", "gg",
                "irc", "ircs", "lastfm", "ldaps", "magnet", "mms",
                "msnim", "notes", "rsync", "secondlife", "skype", "ssh",
                "sftp", "sms", "steam", "webcal", "ymsgr",
        );
        # quotemeta so that the "." in z39.50r / z39.50s only matches a
        # literal dot rather than any character.
        my $uri_schemes=join("|", map { quotemeta($_) } @scheme_names);

        # data is a special case. Allow data:image/<type> for a few
        # image types, but disallow data:text/javascript and everything
        # else. The media type must be followed by ";" (parameters) or
        # "," (start of the data) so that longer subtypes sharing one of
        # these prefixes are not accepted. This is kept separate from
        # the scheme list because it must not be followed by another
        # ":". It carries its own /i because an embedded qr// does not
        # inherit the flags of the pattern it is interpolated into.
        my $data_image=qr/data:image\/(?:png|gif|jpeg)[;,]/i;

        # A link value is accepted when it
        #  - starts with one of the allowed schemes followed by ":", or
        #  - is an allowed image data: uri, or
        #  - contains no ":" at all (a relative link).
        # The scheme alternation needs its own group; without it the
        # ":" would only bind to the last scheme in the list and every
        # other scheme name would match as a bare prefix.
        my $link=qr/^(?:(?:$uri_schemes):|$data_image|[^:]+$)/i;

        # Loaded lazily so the module is only required when scrubbing
        # is actually performed.
        eval q{use HTML::Scrubber};
        error($@) if $@;
        # Lists based on http://feedparser.org/docs/html-sanitization.html
        $_scrubber = HTML::Scrubber->new(
                # Elements that are passed through.
                allow => [qw{
                        a abbr acronym address area b big blockquote br
                        button caption center cite code col colgroup dd del
                        dfn dir div dl dt em fieldset font form h1 h2 h3 h4
                        h5 h6 hr i img input ins kbd label legend li map
                        menu ol optgroup option p pre q s samp select small
                        span strike strong sub sup table tbody td textarea
                        tfoot th thead tr tt u ul var
                }],
                # Attribute rules: plain attributes are allowed as-is,
                # while the uri-carrying ones must match $link.
                default => [undef, { ( map { $_ => 1 } qw{
                        abbr accept accept-charset accesskey
                        align alt axis border cellpadding cellspacing
                        char charoff charset checked cite class
                        clear cols colspan color compact coords
                        datetime dir disabled enctype for frame
                        headers height hreflang hspace id ismap
                        label lang longdesc maxlength media method
                        multiple name nohref noshade nowrap prompt
                        readonly rel rev rows rowspan rules scope
                        selected shape size span start summary
                        tabindex target title type usemap valign
                        value vspace width
                } ),
                "/" => 1, # emit proper <hr /> XHTML
                href => $link,
                src => $link,
                action => $link,
                }],
        );
        return $_scrubber;
} # }}}
79 1