From 40eabc1a3393dc99e83b688178c8255f0483cadd Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Wed, 26 Jan 2022 17:33:58 +0000 Subject: [PATCH 01/29] Aggressively try to retain markup on words if it appears on one of its source tokens I do need those continuation delimiters for that, even though I really don't like them since they're so character set focussed! --- src/translator/html.cpp | 27 ++++++++++++++++++++++++--- src/translator/html.h | 6 +++--- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/src/translator/html.cpp b/src/translator/html.cpp index 20614f578..bcfcc15b6 100644 --- a/src/translator/html.cpp +++ b/src/translator/html.cpp @@ -138,6 +138,15 @@ bool containsTag(HTML::Taint const &stack, HTML::Tag const *tag) { return std::find(stack.rbegin(), stack.rend(), tag) != stack.rend(); } +bool isSubset(HTML::Taint const &a, HTML::Taint const &b) { + if (a.size() > b.size()) return false; + + for (auto i = a.begin(), j = b.begin(); i != a.end(); ++i, ++j) + if (*i != *j) return false; + + return true; +} + template AnnotatedText apply(AnnotatedText const &in, Fun fun) { AnnotatedText out; @@ -448,7 +457,7 @@ void HTML::restore(Response &response) { // Find for every token in target the token in source that best matches. std::vector> alignments; - hardAlignments(response, alignments); + hardAlignments(response, alignments, sourceTokenSpans); std::vector targetTokenSpans; copyTaint(response, alignments, sourceTokenSpans, targetTokenSpans); @@ -591,7 +600,10 @@ bool HTML::isContinuation(string_view prev, string_view str) { options_.continuationDelimiters.find(prev.back()) == std::string::npos; } -void HTML::hardAlignments(Response const &response, std::vector> &alignments) { +void HTML::hardAlignments(Response const &response, std::vector> &alignments, + std::vector const &sourceTokenSpans) { + size_t offset = 0; + // For each sentence... for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) { alignments.emplace_back(); @@ -622,7 +634,14 @@ void HTML::hardAlignments(Response const &response, std::vector= prevScore) { + Taint const &currTaint = sourceTokenSpans[offset + 1 + currSentenceIdx]->tags; + Taint const &prevTaint = sourceTokenSpans[offset + 1 + prevSentenceIdx]->tags; + + // If this token has more markup, or a better score than the previous + // token (and they together are part of a word-ish thing) then mark + // this word as aligning. Otherwise just copy the alignment source of + // the previous token. + if (isSubset(prevTaint, currTaint) || currScore >= prevScore) { // Apply this to all previous tokens in the word for (size_t i = t;; --i) { alignments.back()[i] = currSentenceIdx; @@ -640,6 +659,8 @@ void HTML::hardAlignments(Response const &response, std::vectorderline` should become `un der line`? @@ -86,7 +85,8 @@ class HTML { void copyTaint(Response const &response, std::vector> const &alignments, std::vector const &sourceTokenSpans, std::vector &targetTokenSpans); - void hardAlignments(Response const &response, std::vector> &alignments); + void hardAlignments(Response const &response, std::vector> &alignments, + std::vector const &sourceTokenSpans); bool isContinuation(string_view prev, string_view str); // Allocates tag in pool_ (which then owns it) and gives a pointer to be used // in Taints. Pointer is valid as long as this HTML instance lives on. From 723e725bbb6fbbaae703a887d757f0705d9dab5f Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Wed, 26 Jan 2022 21:06:32 +0000 Subject: [PATCH 02/29] Outdated todo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🎉 --- src/translator/response_options.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/translator/response_options.h b/src/translator/response_options.h index b5867d00d..8ccfab856 100644 --- a/src/translator/response_options.h +++ b/src/translator/response_options.h @@ -19,7 +19,7 @@ struct ResponseOptions { bool qualityScores{false}; ///< Include quality-scores or not. bool alignment{false}; ///< Include alignments or not. - bool HTML{false}; /// Remove HTML tags from text and (TODO) insert in output. + bool HTML{false}; /// Remove HTML tags from text and insert in output. /// Whether to include sentenceMappings or not. Alignments require /// sentenceMappings and are available irrespective of this option if From 9600c70b1725afabd07154355142b1b4ffebaf47 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Tue, 8 Feb 2022 13:00:53 +0000 Subject: [PATCH 03/29] Be explicit about where the two different string_view types are used --- src/translator/html.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/translator/html.cpp b/src/translator/html.cpp index bcfcc15b6..8e8090a5a 100644 --- a/src/translator/html.cpp +++ b/src/translator/html.cpp @@ -4,17 +4,16 @@ #include "xh_scanner.h" namespace { -using marian::string_view; using marian::bergamot::AnnotatedText; using marian::bergamot::ByteRange; using marian::bergamot::HTML; using marian::bergamot::Response; -void encodeEntities(string_view const &input, std::string &output) { +void encodeEntities(marian::string_view const &input, std::string &output) { output.clear(); output.reserve(input.size()); // assumes there are no entities in most cases - for (auto it = input.begin(); it != input.end(); ++it) { + for (const char *it = input.begin(); it != input.end(); ++it) { switch (*it) { case '&': output.append("&"); @@ -41,7 +40,7 @@ void encodeEntities(string_view const &input, std::string &output) { } } -size_t countPrefixWhitespaces(string_view const &input) { +size_t countPrefixWhitespaces(marian::string_view const &input) { size_t size = 0; while (size < input.size() && std::isspace(input[size])) ++size; return size; @@ -98,10 +97,10 @@ std::string format(std::string const &formatTemplate, Arg arg, Args... args) { // `for (auto &&item : reversed(container))` instead of the needlessly verbose // `for (auto it = container.rbegin(); it != container.rend(); ++it)` template -class reversed { +class Reversed { public: - typedef typename T::const_reverse_iterator iterator; - explicit reversed(T const &container) : container_(container){}; + using iterator = typename T::const_reverse_iterator; + explicit Reversed(T const &container) : container_(container){}; iterator begin() const { return container_.rbegin(); } iterator end() const { return container_.rend(); } @@ -167,9 +166,10 @@ AnnotatedText apply(AnnotatedText const &in, Fun fun) { // expects // TODO: extend AnnotatedText::appendSentence to accept str + ByteRanges // directly - std::vector views(tokens.size()); - std::transform(tokens.begin(), tokens.end(), views.begin(), - [&](ByteRange const &range) { return string_view(sentence.data() + range.begin, range.size()); }); + std::vector views(tokens.size()); + std::transform(tokens.begin(), tokens.end(), views.begin(), [&](ByteRange const &range) { + return marian::string_view(sentence.data() + range.begin, range.size()); + }); out.appendSentence(prefix, views.begin(), views.end()); } @@ -200,8 +200,8 @@ bool hasAlignments(Response const &response) { // Little helper class to append HTML to a token class TokenFormatter { public: - explicit TokenFormatter(string_view token) - : html_(), offset_(0), whitespaceOffset_(0), whitespaceSize_(countPrefixWhitespaces(token)), closeLeft_(true) { + explicit TokenFormatter(marian::string_view token) + : offset_(0), whitespaceOffset_(0), whitespaceSize_(countPrefixWhitespaces(token)), closeLeft_(true) { // Do encoding of any entities that popped up in the translation encodeEntities(token, html_); } @@ -214,7 +214,7 @@ class TokenFormatter { diffTags(prev, curr, opening, closing); - for (HTML::Tag const *tag : reversed(closing)) { + for (HTML::Tag const *tag : Reversed(closing)) { assert(tag->type == HTML::Tag::ELEMENT); std::string closeTag = format("", tag->name); html_.insert(offset_ + (closeLeft_ ? 0 : whitespaceSize_), closeTag); From 3d6673cb5ee3a307bd78b45636567dcd6b0a9069 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Tue, 8 Feb 2022 13:03:34 +0000 Subject: [PATCH 04/29] Make HTML tags case insensitive Tag case is retained in the output though. Well, for the opening tag at least. Closing tag always matches opening tag. --- src/tests/units/html_tests.cpp | 10 ++++++++++ src/translator/html.cpp | 27 +++++++++++++++++++-------- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/src/tests/units/html_tests.cpp b/src/tests/units/html_tests.cpp index 8b3ac0f24..4ca48ecf7 100644 --- a/src/tests/units/html_tests.cpp +++ b/src/tests/units/html_tests.cpp @@ -165,6 +165,16 @@ TEST_CASE("Do not abort if the input is just empty element") { CHECK(response.target.text == "

"); } +TEST_CASE("Tag names are case insensitive") { + // Tests

vs

and
should be recognized as a void tag
. + // should be recognized as inline. + std::string test_str("

Space
please?

"); + + std::string input(test_str); + HTML html(std::move(input), true); + CHECK(input == "Spa ce\n\nplease?"); +} + TEST_CASE("Test case html entities") { // These are all entities I would expect in innerHTML, since all other entities // can be encoded as UTF-8 so there's no need to encode them through &...; when diff --git a/src/translator/html.cpp b/src/translator/html.cpp index 8e8090a5a..cb8740ec1 100644 --- a/src/translator/html.cpp +++ b/src/translator/html.cpp @@ -1,5 +1,7 @@ #include "html.h" +#include + #include "response.h" #include "xh_scanner.h" @@ -46,6 +48,13 @@ size_t countPrefixWhitespaces(marian::string_view const &input) { return size; } +std::string toLowerCase(std::string_view const &input) { + std::string out; + out.resize(input.size()); + std::transform(input.begin(), input.end(), out.begin(), [](unsigned char c) { return std::tolower(c); }); + return out; +} + // Formatters used for exception messages combined with format() std::ostream &operator<<(std::ostream &out, HTML::Tag const *tag) { if (tag == nullptr) return out << "[nullptr]"; @@ -335,10 +344,11 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio } break; case markup::Scanner::TT_TAG_START: { - std::string name(scanner.tag()); + std::string name = toLowerCase(scanner.tag()); // Tag *tag is used by attribute parsing - tag = makeTag({contains(options_.voidTags, name) ? Tag::VOID_ELEMENT : Tag::ELEMENT, std::move(name)}); + tag = + makeTag({contains(options_.voidTags, name) ? Tag::VOID_ELEMENT : Tag::ELEMENT, std::string(scanner.tag())}); stack.push_back(tag); @@ -351,21 +361,22 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio } // Treat non-inline HTML tags as spaces that break up words. - if (!contains(options_.inlineTags, tag->name)) { + if (!contains(options_.inlineTags, name)) { addSentenceBreak = true; } else { addSpace = true; } } break; - case markup::Scanner::TT_TAG_END: + case markup::Scanner::TT_TAG_END: { + std::string tagName = toLowerCase(scanner.tag()); // If this is the closing bit of a void tag, i.e. triggered by the "/>" // bit of "", then completely ignore it. - if (contains(options_.voidTags, std::string(scanner.tag()))) break; + if (contains(options_.voidTags, tagName)) break; if (stack.empty()) throw BadHTML(format("Encountered more closing tags ({}) than opening tags", scanner.tag())); - if (stack.back()->name != scanner.tag()) + if (toLowerCase(stack.back()->name) != toLowerCase(scanner.tag())) throw BadHTML(format("Encountered unexpected closing tag , stack is {}", scanner.tag(), stack)); // What to do with "" case, where tag is immediately closed @@ -377,12 +388,12 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio stack.pop_back(); // Add space if necessary - if (!contains(options_.inlineTags, std::string(scanner.tag()))) { + if (!contains(options_.inlineTags, tagName)) { addSentenceBreak = true; } else { addSpace = true; } - break; + } break; case markup::Scanner::TT_ATTRIBUTE: assert(tag != nullptr); From 5634c40a53240a1debceec8dd6552f7afa13ade3 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Tue, 8 Feb 2022 13:04:44 +0000 Subject: [PATCH 05/29] Treat special Fixes #339 --- src/tests/units/html_tests.cpp | 16 ++++++++++++++++ src/translator/html.cpp | 4 ++-- src/translator/html.h | 3 +++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/tests/units/html_tests.cpp b/src/tests/units/html_tests.cpp index 4ca48ecf7..e50a60bbf 100644 --- a/src/tests/units/html_tests.cpp +++ b/src/tests/units/html_tests.cpp @@ -621,6 +621,22 @@ TEST_CASE("Test comment") { CHECK(response.target.text == test_str); } +TEST_CASE("Test element") { + std::string test_str("hello"); + + std::string input(test_str); + HTML html(std::move(input), true); + CHECK(input == "hello"); +} + +TEST_CASE("Test element (case-insensitive)") { + std::string test_str("hello"); + + std::string input(test_str); + HTML html(std::move(input), true); + CHECK(input == "hello"); +} + TEST_CASE("End-to-end translation", "[!mayfail]") { std::string input("

I like to drive this car.

"); HTML html(std::move(input), true); diff --git a/src/translator/html.cpp b/src/translator/html.cpp index cb8740ec1..520c98fbf 100644 --- a/src/translator/html.cpp +++ b/src/translator/html.cpp @@ -363,7 +363,7 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio // Treat non-inline HTML tags as spaces that break up words. if (!contains(options_.inlineTags, name)) { addSentenceBreak = true; - } else { + } else if (!contains(options_.inWordTags, name)) { addSpace = true; } } break; @@ -390,7 +390,7 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio // Add space if necessary if (!contains(options_.inlineTags, tagName)) { addSentenceBreak = true; - } else { + } else if (!contains(options_.inWordTags, tagName)) { addSpace = true; } } break; diff --git a/src/translator/html.h b/src/translator/html.h index 9939c51d6..8f30eb6d6 100644 --- a/src/translator/html.h +++ b/src/translator/html.h @@ -34,6 +34,9 @@ class HTML { "output", "q", "ruby", "small", "span", "strong", "sub", "sup", "time", "u", "var", "wbr", "ins", "del", "img"}; + // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/wbr + std::unordered_set inWordTags{"wbr"}; + // List of characters that occur at the start of a token that indicate that // the this token is probably *not* a continuation of a word. Set to empty // to never mark a token as a continuation of the word. From e516dbdaba0522e496cfc80ecf15a2b79a2513a2 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Tue, 8 Feb 2022 16:42:32 +0000 Subject: [PATCH 06/29] Add support for ignoring tags Fixes #313 --- src/tests/units/html_tests.cpp | 8 ++++ src/translator/html.cpp | 83 ++++++++++++++++++++++++++++++++++ src/translator/html.h | 6 +++ 3 files changed, 97 insertions(+) diff --git a/src/tests/units/html_tests.cpp b/src/tests/units/html_tests.cpp index e50a60bbf..ae6493c30 100644 --- a/src/tests/units/html_tests.cpp +++ b/src/tests/units/html_tests.cpp @@ -637,6 +637,14 @@ TEST_CASE("Test element (case-insensitive)") { CHECK(input == "hello"); } +TEST_CASE("Test ignored element") { + std::string test_str("hello this is nested var world"); + + std::string input(test_str); + HTML html(std::move(input), true); + CHECK(input == "hello world"); +} + TEST_CASE("End-to-end translation", "[!mayfail]") { std::string input("

I like to drive this car.

"); HTML html(std::move(input), true); diff --git a/src/translator/html.cpp b/src/translator/html.cpp index 520c98fbf..279b68ef1 100644 --- a/src/translator/html.cpp +++ b/src/translator/html.cpp @@ -7,6 +7,7 @@ namespace { using marian::bergamot::AnnotatedText; +using marian::bergamot::BadHTML; using marian::bergamot::ByteRange; using marian::bergamot::HTML; using marian::bergamot::Response; @@ -284,6 +285,80 @@ size_t debugCountTokens(AnnotatedText const &text) { return tokens; } +// Helper function that consumes a tag as if it is a special tag, except that it +// takes nesting into account. I.e. `` will be consumed to the +// last ``. Assumes TT_TAG_START is already consumed, which was necessary +// to determine whether this was an element that needed to be ignored. +void consumeIgnoredTag(markup::Scanner &scanner, HTML::Tag &tag, std::string const &name) { + // Only full elements can be consumed this way. With void tags we don't know + // where to stop scanning. All other types cannot be nested anyway. + assert(tag.type == HTML::Tag::ELEMENT); + + // TT_TAG_START is already consumed. + markup::Scanner::TokenType token; + size_t inside = 0; + + // Consume the full open tag, i.e. all its attributes + while (!inside) { + token = scanner.next(); + switch (token) { + case markup::Scanner::TT_ERROR: + throw BadHTML("HTML parse error"); + case markup::Scanner::TT_EOF: + throw BadHTML(format("Did not find closing tag ", name)); + case markup::Scanner::TT_ATTRIBUTE: + tag.attributes += format(" {}=\"{}\"", scanner.attribute(), scanner.value()); + break; + default: + // Not an attribute! Must be something inside the body or the closing + // tag already. Time to jump to the next loop. + ++inside; + break; + } + } + + // Last token was something that would have triggered Scanner::scanBody(), + // which sets value() to start pointing at the body. + const char *start = scanner.value().data(); + + while (inside) { + switch (token) { + case markup::Scanner::TT_ERROR: + throw BadHTML("HTML parse error"); + case markup::Scanner::TT_EOF: + throw BadHTML(format("Did not find closing tag ", name)); + case markup::Scanner::TT_TAG_START: + case markup::Scanner::TT_TAG_END: + // Note: Looking specifically for only our own type of tag so we don't + // have to care about whether other tags we encounter are void tags or + // not. Does assume the HTML is valid, as no stack is kept. + if (toLowerCase(scanner.tag()) == name) { + if (token == markup::Scanner::TT_TAG_END) { + if (--inside == 0) break; // also stops loop because !inside + } else { + ++inside; + } + } + // intentional fall-through to scanner.next()! + default: + token = scanner.next(); + break; + } + } + + // Only a TAG_END could have stopped the previous loop. If we know the name + // of that closing element, e.g. ``, we also know the position of + // where the body ended. 2 (`= start); + tag.data = std::string_view(start, end - start); +} + } // namespace namespace marian::bergamot { @@ -360,6 +435,14 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio stack.pop_back(); } + // Ignored tags have same semantics as void tags with regards to moving + // them around with the rest of the content. + if (contains(options_.ignoredTags, name)) { + consumeIgnoredTag(scanner, *tag, name); + spans_.push_back(Span{source.size(), source.size(), stack}); + stack.pop_back(); + } + // Treat non-inline HTML tags as spaces that break up words. if (!contains(options_.inlineTags, name)) { addSentenceBreak = true; diff --git a/src/translator/html.h b/src/translator/html.h index 8f30eb6d6..467aafa9a 100644 --- a/src/translator/html.h +++ b/src/translator/html.h @@ -37,6 +37,12 @@ class HTML { // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/wbr std::unordered_set inWordTags{"wbr"}; + // List of elements we copy as is, but do parse as if they're HTML because + // they could be nested. For because + // the script tag may not be nested, but that is not the case for these + // elements per se. + std::unordered_set ignoredTags{"code", "kbd", "samp", "var", "dir", "acronym", "math", "textarea"}; + // List of characters that occur at the start of a token that indicate that // the this token is probably *not* a continuation of a word. Set to empty // to never mark a token as a continuation of the word. From 46159ba8180c041baf725e38b7bdfb8aea0d5ac2 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Wed, 9 Feb 2022 11:09:57 +0000 Subject: [PATCH 07/29] Add test for regression in ignored element code path std::bad_alloc :( Also expand tests to make sure we're recording the full ignored tag contents. --- src/tests/units/html_tests.cpp | 48 +++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/src/tests/units/html_tests.cpp b/src/tests/units/html_tests.cpp index ae6493c30..0329ef436 100644 --- a/src/tests/units/html_tests.cpp +++ b/src/tests/units/html_tests.cpp @@ -637,12 +637,54 @@ TEST_CASE("Test element (case-insensitive)") { CHECK(input == "hello"); } -TEST_CASE("Test ignored element") { - std::string test_str("hello this is nested var world"); +TEST_CASE("Test ignored element (nested)") { + std::string test_str("foo nested bar"); + std::string expected_str("foo nestedbar"); std::string input(test_str); HTML html(std::move(input), true); - CHECK(input == "hello world"); + CHECK(input == "foo bar"); + + Response response; + std::string sentence_str("foo bar"); + std::vector sentence{ + string_view(sentence_str.data() + 0, 3), // foo + string_view(sentence_str.data() + 3, 1), // _ + string_view(sentence_str.data() + 4, 4), // _bar + string_view(sentence_str.data() + 8, 0), // "" + }; + response.source.appendSentence("", sentence.begin(), sentence.end()); + response.target.appendSentence("", sentence.begin(), sentence.end()); + response.alignments = {identity_matrix(4)}; + + html.restore(response); + CHECK(response.source.text == expected_str); + CHECK(response.target.text == expected_str); +} + +TEST_CASE("Test ignored element (with entity)") { + std::string test_str("foo & bar"); + std::string expected_str("foo &bar"); + + std::string input(test_str); + HTML html(std::move(input), true); + CHECK(input == "foo bar"); + + Response response; + std::string sentence_str("foo bar"); + std::vector sentence{ + string_view(sentence_str.data() + 0, 3), // foo + string_view(sentence_str.data() + 3, 1), // _ + string_view(sentence_str.data() + 4, 4), // _bar + string_view(sentence_str.data() + 8, 0), // "" + }; + response.source.appendSentence("", sentence.begin(), sentence.end()); + response.target.appendSentence("", sentence.begin(), sentence.end()); + response.alignments = {identity_matrix(4)}; + + html.restore(response); + CHECK(response.source.text == expected_str); + CHECK(response.target.text == expected_str); } TEST_CASE("End-to-end translation", "[!mayfail]") { From af39c75d1a44579fd025605467a28ab46c88bc25 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Wed, 9 Feb 2022 12:38:50 +0000 Subject: [PATCH 08/29] Fix bad_alloc in consumeIgnoredTag Trouble was that `Scanner::scanEntity()` returns a value() that does not point to inside the HTML input stream (but to a *decoded* entity instead). So we need another API, `Scanner::start()`, to figure out where a token starts in HTML. --- src/translator/html.cpp | 10 +++++----- src/translator/xh_scanner.cpp | 10 ++++++++++ src/translator/xh_scanner.h | 6 ++++++ 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/translator/html.cpp b/src/translator/html.cpp index 279b68ef1..2af2030ee 100644 --- a/src/translator/html.cpp +++ b/src/translator/html.cpp @@ -319,8 +319,9 @@ void consumeIgnoredTag(markup::Scanner &scanner, HTML::Tag &tag, std::string con // Last token was something that would have triggered Scanner::scanBody(), // which sets value() to start pointing at the body. - const char *start = scanner.value().data(); + const char *start = scanner.start(); + // Consume the rest of the HTML until (including) the final closing tag. while (inside) { switch (token) { case markup::Scanner::TT_ERROR: @@ -346,11 +347,10 @@ void consumeIgnoredTag(markup::Scanner &scanner, HTML::Tag &tag, std::string con } } - // Only a TAG_END could have stopped the previous loop. If we know the name - // of that closing element, e.g. ``, we also know the position of - // where the body ended. 2 (`" scanFun_ = &Scanner::scanBody; gotTail_ = false; return TT_COMMENT_END; } + start_ = input_.pos(); value_ = string_ref{input_.pos(), 0}; while (true) { @@ -334,11 +340,13 @@ Scanner::TokenType Scanner::scanComment() { Scanner::TokenType Scanner::scanProcessingInstruction() { if (gotTail_) { + start_ = input_.pos() - 2; scanFun_ = &Scanner::scanBody; gotTail_ = false; return TT_PROCESSING_INSTRUCTION_END; } + start_ = input_.pos(); value_ = string_ref{input_.pos(), 0}; while (true) { @@ -356,11 +364,13 @@ Scanner::TokenType Scanner::scanProcessingInstruction() { Scanner::TokenType Scanner::scanSpecial() { if (gotTail_) { + start_ = input_.pos() - (tagName_.size + 3); scanFun_ = &Scanner::scanBody; gotTail_ = false; return TT_TAG_END; } + start_ = input_.pos(); value_ = string_ref{input_.pos(), 0}; while (true) { diff --git a/src/translator/xh_scanner.h b/src/translator/xh_scanner.h index 14d755bbd..530df675d 100644 --- a/src/translator/xh_scanner.h +++ b/src/translator/xh_scanner.h @@ -83,6 +83,7 @@ class Scanner { tagName_{nullptr, 0}, attributeName_{nullptr, 0}, input_(is), + start_(nullptr), scanFun_(&Scanner::scanBody), gotTail_(false) {} @@ -98,6 +99,8 @@ class Scanner { // get tag name std::string_view tag() const; + inline const char *start() const { return start_; } + private: /* methods */ typedef TokenType (Scanner::*ScanPtr)(); @@ -137,6 +140,9 @@ class Scanner { instream &input_; + // Start position of a token. + const char *start_; + bool gotTail_; // aux flag used in scanComment, scanSpecial, scanProcessingInstruction }; } // namespace markup From f595c5189053232672811d16572659b63062edea Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Fri, 11 Feb 2022 12:11:41 +0000 Subject: [PATCH 09/29] Prevent straggler void elements to show up twice When a word near the of a translated sentence aligns with one at the beginning, it pushes prevIt back to the beginning. Then the next translated token will insert all straggler void elements between prevIt and it. Instead of using prevIt to track where we were with inserting stragglers, we keep our own iterator that never moves backwards. --- src/translator/html.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/translator/html.cpp b/src/translator/html.cpp index 2af2030ee..6f0dca942 100644 --- a/src/translator/html.cpp +++ b/src/translator/html.cpp @@ -606,27 +606,28 @@ AnnotatedText HTML::restoreSource(AnnotatedText const &in, std::vector const &targetTokenSpans) { auto prevSpan = spans_.cbegin(); auto targetSpanIt = targetTokenSpans.begin(); + auto straggerSpanIt = spans_.cbegin(); AnnotatedText out = apply(in, [&](ByteRange range, string_view token, bool last) { TokenFormatter formatter(token); // First we scan through spans_ to catch up to the span assigned to this // token. We're only interested in empty spans (empty and void elements) - for (auto span_it = prevSpan; span_it < *targetSpanIt; span_it++) { + for (; straggerSpanIt < *targetSpanIt; ++straggerSpanIt) { // We're only interested in empty spans or spans that would otherwise get // lost because they didn't align with anything between the spans in // targetSpanIt // TODO That std::find makes this O(N*N) NOT GOOD NOT GOOD - if (span_it->size() != 0 && - std::find(targetTokenSpans.begin(), targetTokenSpans.end(), span_it) != targetTokenSpans.end()) + if (straggerSpanIt->size() != 0 && + std::find(targetTokenSpans.begin(), targetTokenSpans.end(), straggerSpanIt) != targetTokenSpans.end()) continue; - formatter.append(prevSpan->tags, span_it->tags); + formatter.append(prevSpan->tags, straggerSpanIt->tags); // Note: here, not in 3rd part of for-statement because we don't want to // set prevSpan if the continue clause at the beginning of this for-loop // was hit. - prevSpan = span_it; + prevSpan = straggerSpanIt; } // Now do the same thing but for our target set of tags. Note that we cannot From 32f403ab36e3a9e23ab59d8f835773d5d52636f4 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Fri, 11 Feb 2022 23:09:03 +0000 Subject: [PATCH 10/29] Use isContinuation function to check whether we need to insert a space after a tag Main reason for using this instead of `std::isspace` is to prevent a space being inserted between the tag and the full stop in `This is a test.`. Because that has been bothering me a lot. --- src/translator/html.cpp | 9 ++++++--- src/translator/html.h | 5 ++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/translator/html.cpp b/src/translator/html.cpp index 6f0dca942..7d662345f 100644 --- a/src/translator/html.cpp +++ b/src/translator/html.cpp @@ -406,8 +406,7 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio // If the previous segment was an open or close tag, it might be best // to add a space to make sure we don't append to the previous word. if (addSpace) { - if (options_.substituteInlineTagsWithSpaces && !source.empty() && !std::isspace(source.back()) && - !std::isspace(scanner.value()[0])) { + if (options_.substituteInlineTagsWithSpaces && isContinuation(source, scanner.value())) { source.push_back(' '); } addSpace = false; @@ -688,13 +687,17 @@ void HTML::copyTaint(Response const &response, std::vector> // to determine whether we should share the markup, or whether we should see // this token as a fresh start. This implementation will treat "hello[world]" // as 4 words, assuming its tokenised as something like `h ell o [ wor ld ]`. -bool HTML::isContinuation(string_view prev, string_view str) { +bool HTML::isContinuation(std::string_view prev, std::string_view str) { if (options_.continuationDelimiters.empty()) return false; if (prev.empty() || str.empty()) return false; return options_.continuationDelimiters.find(str[0]) == std::string::npos && options_.continuationDelimiters.find(prev.back()) == std::string::npos; } +bool HTML::isContinuation(marian::string_view prev, marian::string_view str) { + return isContinuation(std::string_view(prev.data(), prev.size()), std::string_view(str.data(), str.size())); +} + void HTML::hardAlignments(Response const &response, std::vector> &alignments, std::vector const &sourceTokenSpans) { size_t offset = 0; diff --git a/src/translator/html.h b/src/translator/html.h index 467aafa9a..0bbaf05cd 100644 --- a/src/translator/html.h +++ b/src/translator/html.h @@ -4,9 +4,11 @@ #include #include #include +#include #include #include "annotation.h" +#include "data/types.h" #include "definitions.h" namespace marian { @@ -96,7 +98,8 @@ class HTML { std::vector &targetTokenSpans); void hardAlignments(Response const &response, std::vector> &alignments, std::vector const &sourceTokenSpans); - bool isContinuation(string_view prev, string_view str); + bool isContinuation(marian::string_view prev, marian::string_view str); + bool isContinuation(std::string_view prev, std::string_view str); // Allocates tag in pool_ (which then owns it) and gives a pointer to be used // in Taints. Pointer is valid as long as this HTML instance lives on. Tag *makeTag(Tag &&tag); From 72e54f82bfe948fd46ad72ee72bce6d3ec0b886c Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Mon, 14 Feb 2022 13:41:59 +0000 Subject: [PATCH 11/29] Treat more elements as opaque when parsing These are all elements that Firefox treats as opaque in their HTML5 parser. As a consequence, when you'd request `noscriptElement.innerHTML` you'd get the raw text content of the thing, as opposed to a serialized tree. So invalid HTML? Just passed on as is! Well, we're going to do the same then. Besides, if noscript then also probably no extension. --- src/translator/html.h | 2 +- src/translator/xh_scanner.cpp | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/translator/html.h b/src/translator/html.h index f4f7d88f2..8c18272d9 100644 --- a/src/translator/html.h +++ b/src/translator/html.h @@ -38,7 +38,7 @@ class HTML { // they could be nested. For because // the script tag may not be nested, but that is not the case for these // elements per se. - std::unordered_set ignoredTags{"code", "kbd", "samp", "var", "dir", "acronym", "math", "textarea"}; + std::unordered_set ignoredTags{"code", "kbd", "samp", "var", "dir", "acronym", "math"}; // List of characters that occur at the start of a token that indicate that // the this token is probably *not* a continuation of a word. Set to empty diff --git a/src/translator/xh_scanner.cpp b/src/translator/xh_scanner.cpp index 050f0acf9..a62f05f25 100644 --- a/src/translator/xh_scanner.cpp +++ b/src/translator/xh_scanner.cpp @@ -99,15 +99,16 @@ Scanner::TokenType Scanner::scanAttribute() { switch (input_.peek()) { case '>': input_.consume(); - if (equalsCaseInsensitive(tagName_, "script")) { + + // Treat some elements as opaque, e.g. because // the script tag may not be nested, but that is not the case for these - // elements per se. - std::unordered_set ignoredTags{"code", "kbd", "samp", "var", "dir", "acronym", "math"}; + // elements per se. Some tags, like because - // the script tag may not be nested, but that is not the case for these - // elements per se. Some tags, like because + /// the script tag may not be nested, but that is not the case for these + /// elements per se. Some tags, like