From 883d2141a805a1dd5d171cbc45ed6d7dd54d51bf Mon Sep 17 00:00:00 2001 From: Simon McVittie Date: Sat, 1 Dec 2018 21:05:18 +0000 Subject: [PATCH] Review --- ..._of_skipping_if_previously_translated.mdwn | 247 +++++++++++------- .../20180628-patch.txt | 85 ++++++ 2 files changed, 242 insertions(+), 90 deletions(-) create mode 100644 doc/todo/Re-use_translated_content_instead_of_skipping_if_previously_translated/20180628-patch.txt diff --git a/doc/todo/Re-use_translated_content_instead_of_skipping_if_previously_translated.mdwn b/doc/todo/Re-use_translated_content_instead_of_skipping_if_previously_translated.mdwn index 749551ccc..0483f2c10 100644 --- a/doc/todo/Re-use_translated_content_instead_of_skipping_if_previously_translated.mdwn +++ b/doc/todo/Re-use_translated_content_instead_of_skipping_if_previously_translated.mdwn @@ -1,99 +1,53 @@ - From: Chris Lamb - Date: Thu, 28 Jun 2018 19:30:15 +0100 - Subject: [PATCH] Re-use translated content instead of skipping if previously - translated. - - This fixes an issue where an initial `inline` directive would be translated - correctly, but subsequent inlines of the same would result in the raw - contents of the `.po` file being inserted into the page instead. - - For example, given a `index.mdwn` containing: - - \[[!inline pages="inline" raw="yes"]] - \[[!inline pages="inline" raw="yes"]] - - .. and an `index.de.po` of: - - msgid "\[[!inline pages=\"inline\" raw=\"yes\"]]\n" - msgstr "\[[!inline pages=\"inline.de\" raw=\"yes\"]]\n" - - .. together with an `inline.mdwn` of: - - This is inlined content. - - .. and an `inline.de.po` of: - - msgid "This is inlined content." - msgstr "This is German inlined content." - - .. would result in the following translation: - - This is the inlined content. - # SOME DESCRIPTIVE TITLE - # Copyright (C) YEAR Free Software Foundation, Inc. - # This file is distributed under the same license as the PACKAGE package. - # FIRST AUTHOR , YEAR. - - .. instead of, of course: - - This is the inlined content. - This is the inlined content. - --- - IkiWiki/Plugin/po.pm | 15 +++++++++------ - 1 file changed, 9 insertions(+), 6 deletions(-) - - diff --git a/IkiWiki/Plugin/po.pm b/IkiWiki/Plugin/po.pm - index 418e8e58a..ecd1f5499 100644 - --- a/IkiWiki/Plugin/po.pm - +++ b/IkiWiki/Plugin/po.pm - @@ -303,9 +303,12 @@ sub filter (@) { - my $page = $params{page}; - my $destpage = $params{destpage}; - my $content = $params{content}; - - if (istranslation($page) && ! alreadyfiltered($page, $destpage)) { - - $content = po_to_markup($page, $content); - - setalreadyfiltered($page, $destpage); - + if (istranslation($page)) { - + if (!defined(alreadyfiltered($page, $destpage))) { - + $content = po_to_markup($page, $content); - + setalreadyfiltered($page, $destpage, $content); - + } - + $content = alreadyfiltered($page, $destpage); - } - return $content; - } - @@ -747,15 +750,15 @@ sub myisselflink ($$) { - my $page=shift; - my $destpage=shift; - - - return exists $filtered{$page}{$destpage} - - && $filtered{$page}{$destpage} eq 1; - + return $filtered{$page}{$destpage}; - } - - sub setalreadyfiltered($$) { - my $page=shift; - my $destpage=shift; - + my $content=shift; - - - $filtered{$page}{$destpage}=1; - + $filtered{$page}{$destpage}=$content; - } - - sub unsetalreadyfiltered($$) { - -- - 2.18.0 - +There is an issue where an initial "inline" directive would be +translated correctly but subsequent inlines of the same page would +result in the raw contents of the ".po" file (ie. starting with the raw +copyright headers!) being inserted into the page instead. + +For example, given a "index.mdwn" containing: + + [[!inline pages="inline" raw="yes"]] + [[!inline pages="inline" raw="yes"]] + +… and an "index.de.po" of: + + msgid "[[!inline pages=\"inline\" raw=\"yes\"]]\n" + msgstr "[[!inline pages=\"inline.de\" raw=\"yes\"]]\n" + +… together with an "inline.mdwn" of: + + This is inlined content. + +… and an "inline.de.po" of: + + msgid "This is inlined content." + msgstr "This is German inlined content." + +§ + +This would result in the following translation: + + This is the inlined content. + # SOME DESCRIPTIVE TITLE + # Copyright (C) YEAR Free Software Foundation, Inc. + # This file is distributed under the same license as the PACKAGE package. + # FIRST AUTHOR , YEAR. + +… instead of (of course) + + This is the inlined content. + This is the inlined content. + +[[Initially proposed patch from Chris Lamb|20180628-patch.txt]] + [[!tag patch]] > Thank you Chris! I've reviewed the patch (with my "original author of the po plugin" hat on) and it looks good to me. I'm not 100% sure about `alreadyfiltered` being the best name for something that's not a predicated anymore but it's good enough. Then I wore my end-user hat and confirmed that with Chris' patch applied, the reproducer we had for this bug at Tails works fine. So IMO we're good to go and I recommend to apply this patch. Thanks in advance! -- [[intrigeri]] - > Any update on getting this merged? — [[lamby]], Fri, 24 Aug 2018 12:36:37 +0200 > Indeed, would love to see this merged! What might be the next steps here? — [[lamby]], Thu, 18 Oct 2018 17:57:37 -0400 -> I've filed this in Debian GNU/Linux at https://bugs.debian.org/911356 — [[lamby]], Thu, 18 Oct 2018 20:18:58 -0400 +> I've filed this in Debian GNU/Linux at — [[lamby]], Thu, 18 Oct 2018 20:18:58 -0400 >> As I said on the Debian bug, I think we badly need test coverage for >> this sort of thing, otherwise it *will* regress. The po plugin is @@ -103,5 +57,118 @@ >> bugs) without anyone realising. >> >> For now I've added a failing test-case for this particular bug. ->> I'll try to review this soon and understand the po plugin, the patch, ->> why the bug exists and why the patch fixes it. --[[smcv]] +>> --[[smcv]] + +--- + +Review from [[smcv]]: + +The patch attached to the Debian bug and the patch pasted here (which +I've moved to an attachment) appear to be different, but I'm not going to +do a line-by-line review of the patches and their differences for now +because I'm not sure their approach is fully correct. + +As we know, the two hardest things in computer science are naming, cache +invalidation and off-by-one errors. Unfortunately this patch has issues +with naming and cache invalidation. It's effectively turning the +`alreadyfiltered` mechanism into a cache of memoized results of calling +`po_to_markup()` on pages' content, keyed by the page name and the +`destpage`, which is either the page name again or the name of a page +into which the `page` is to be inlined (even though the result of +`po_to_markup()` doesn't actually vary with the `destpage`). + +This naturally raises the usual concerns about having a cache: + +* How large does it grow? +* Do we invalidate it every time we need to? +* Do we even need it? + +The cache size is mainly a concern for large wikis being rebuilt. If you +have a wiki with 1000 translated pages in 3 languages each, each of which +is inlined into an average of one other page, then by the time you finish +a rebuild you'll be holding 6000 translated pages in memory. If we change +the `alreadyfiltered` mechanism to be keyed by the page name and not the +(page, destpage) pair, that drops to 3000, but that's still +O(pages \* languages) which seems like a lot. As a general design +principle, ikiwiki tries not to hold full content in RAM for more than +the currently-processed page. + +We invalidate the `alreadyfiltered` for a (page, page) pair in an +editcontent hook, and we never invalidate (page, destpage) pairs for +page != destpage at all. Are we sure there is no other circumstance in +which the content of a page can change? + +One of the things I tried doing for a simple solution was to remove the +cache altogether, because I wasn't sure why we had this `alreadyfiltered` +mechanism in the first place. This passes tests, which suggests that +either the `alreadyfiltered` mechanism is unnecessary, or our regression +test coverage for `po` is insufficient. + +Looking back at the history of the `po` plugin, it seems that the +`alreadyfiltered` mechanism was introduced (under a different name, +with less abstraction) by [[intrigeri]] in commit 1e874b3f: + +``` +po plugin[filter]: avoid converting more than once per destfile + +Only the first filter function call on a given {page,destpage} must convert it +from the PO file, subsequent calls must leave the passed $content unmodified. + +Else, preprocessing loops are the rule. +``` + +I don't understand this. Under what circumstances would we pass content +through the filter hooks, and then pass it back through the same filter +hooks? Can we not do that, instead? If at all possible we should at +least have test coverage for the situation where this happened (but I +can't add this without knowing what it was). + +I feel as though it should be an invariant that the output of a filter +hook is never passed back through filter hooks: otherwise every filter +hook would have to be able to be able to detect and skip processing +its own output, which is not necessarily even possible. For instance, +suppose you had a plugin with a filter that turned tab-separated text +files into `` markup: every HTML file that doesn't contain tabs +is trivially a TSV file with one column, so you can't know whether a +blob of text is TSV or HTML. + +I wondered whether the loops referenced in 1e874b3f might have been +fixed in 192ce7a2: + + remove unnecessary and troublesome filter calls + + This better defines what the filter hook is passed, to only be the raw, + complete text of a page. Not some snippet, or data read in from an + unrelated template. + + Several plugins that filtered text that originates from an (already + filtered) page were modified not to do that. Note that this was not + done very consistently before; other plugins that receive text from a + page called preprocess on it w/o first calling filter. + + The template plugin gets text from elsewhere, and was also changed not to + filter it. That leads to one known regression -- the embed plugin cannot + be used to embed stuff in templates now. But that plugin is deprecated + anyway. + + Later we may want to increase the coverage of what is filtered. Perhaps + a good goal would be to allow writing a filter plugin that filters + out unwanted words, from any input. We're not there yet; not only + does the template plugin load unfiltered text from its templates now, + but so can the table plugin, and other plugins that use templates (like + inline!). I think we can cross that bridge when we come to it. If I wanted + such a censoring plugin, I'd probably make it use a sanitize hook instead, + for the better coverage. + + For now I am concentrating on the needs of the two non-deprecated users + of filter. This should fix bugs/po_vs_templates, and it probably fixes + an obscure bug around txt's use of filter for robots.txt. + +but I'm not sure that any of the redundant filtering removed in that +commit was actually relevant to `po` users? + +[[intrigeri]], can you shed any light on this? + +Naming is the easy part of this review: the `alreadyfiltered` family of +functions are not named like cache getter/setter functions. This could +be resolved by renaming. diff --git a/doc/todo/Re-use_translated_content_instead_of_skipping_if_previously_translated/20180628-patch.txt b/doc/todo/Re-use_translated_content_instead_of_skipping_if_previously_translated/20180628-patch.txt new file mode 100644 index 000000000..217352f19 --- /dev/null +++ b/doc/todo/Re-use_translated_content_instead_of_skipping_if_previously_translated/20180628-patch.txt @@ -0,0 +1,85 @@ +From: Chris Lamb +Date: Thu, 28 Jun 2018 19:30:15 +0100 +Subject: [PATCH] Re-use translated content instead of skipping if previously + translated. + +This fixes an issue where an initial `inline` directive would be translated +correctly, but subsequent inlines of the same would result in the raw +contents of the `.po` file being inserted into the page instead. + +For example, given a `index.mdwn` containing: + + \[[!inline pages="inline" raw="yes"]] + \[[!inline pages="inline" raw="yes"]] + +.. and an `index.de.po` of: + + msgid "\[[!inline pages=\"inline\" raw=\"yes\"]]\n" + msgstr "\[[!inline pages=\"inline.de\" raw=\"yes\"]]\n" + +.. together with an `inline.mdwn` of: + + This is inlined content. + +.. and an `inline.de.po` of: + + msgid "This is inlined content." + msgstr "This is German inlined content." + +.. would result in the following translation: + + This is the inlined content. + # SOME DESCRIPTIVE TITLE + # Copyright (C) YEAR Free Software Foundation, Inc. + # This file is distributed under the same license as the PACKAGE package. + # FIRST AUTHOR , YEAR. + +.. instead of, of course: + + This is the inlined content. + This is the inlined content. +--- + IkiWiki/Plugin/po.pm | 15 +++++++++------ + 1 file changed, 9 insertions(+), 6 deletions(-) + +diff --git a/IkiWiki/Plugin/po.pm b/IkiWiki/Plugin/po.pm +index 418e8e58a..ecd1f5499 100644 +--- a/IkiWiki/Plugin/po.pm ++++ b/IkiWiki/Plugin/po.pm +@@ -303,9 +303,12 @@ sub filter (@) { + my $page = $params{page}; + my $destpage = $params{destpage}; + my $content = $params{content}; +- if (istranslation($page) && ! alreadyfiltered($page, $destpage)) { +- $content = po_to_markup($page, $content); +- setalreadyfiltered($page, $destpage); ++ if (istranslation($page)) { ++ if (!defined(alreadyfiltered($page, $destpage))) { ++ $content = po_to_markup($page, $content); ++ setalreadyfiltered($page, $destpage, $content); ++ } ++ $content = alreadyfiltered($page, $destpage); + } + return $content; + } +@@ -747,15 +750,15 @@ sub myisselflink ($$) { + my $page=shift; + my $destpage=shift; + +- return exists $filtered{$page}{$destpage} +- && $filtered{$page}{$destpage} eq 1; ++ return $filtered{$page}{$destpage}; + } + + sub setalreadyfiltered($$) { + my $page=shift; + my $destpage=shift; ++ my $content=shift; + +- $filtered{$page}{$destpage}=1; ++ $filtered{$page}{$destpage}=$content; + } + + sub unsetalreadyfiltered($$) { +-- +2.18.0 -- 2.39.2