diff --git a/bergamot-translator-tests b/bergamot-translator-tests index 3c0f95a17..3776609ce 160000 --- a/bergamot-translator-tests +++ b/bergamot-translator-tests @@ -1 +1 @@ -Subproject commit 3c0f95a1775a74f5db441aa2f17ceb7437679022 +Subproject commit 3776609ce5f7a238245e303efaa007b2d5078180 diff --git a/src/tests/units/html_tests.cpp b/src/tests/units/html_tests.cpp index 2b71784fb..96eff5aad 100644 --- a/src/tests/units/html_tests.cpp +++ b/src/tests/units/html_tests.cpp @@ -172,6 +172,16 @@ TEST_CASE("Do not abort if the input is just empty element") { CHECK(response.target.text == "

"); } +TEST_CASE("Tag names are case insensitive") { + // Tests

vs

and
should be recognized as a void tag
. + // should be recognized as inline. + std::string test_str("

Space
please?

"); + + std::string input(test_str); + HTML html(std::move(input), true); + CHECK(input == "Spa ce\n\nplease?"); +} + TEST_CASE("Test case html entities") { // These are all entities I would expect in innerHTML, since all other entities // can be encoded as UTF-8 so there's no need to encode them through &...; when @@ -618,6 +628,72 @@ TEST_CASE("Test comment") { CHECK(response.target.text == test_str); } +TEST_CASE("Test element") { + std::string test_str("hello"); + + std::string input(test_str); + HTML html(std::move(input), true); + CHECK(input == "hello"); +} + +TEST_CASE("Test element (case-insensitive)") { + std::string test_str("hello"); + + std::string input(test_str); + HTML html(std::move(input), true); + CHECK(input == "hello"); +} + +TEST_CASE("Test ignored element (nested)") { + std::string test_str("foo nested bar"); + std::string expected_str("foo nestedbar"); + + std::string input(test_str); + HTML html(std::move(input), true); + CHECK(input == "foo bar"); + + Response response; + std::string sentence_str("foo bar"); + std::vector sentence{ + string_view(sentence_str.data() + 0, 3), // foo + string_view(sentence_str.data() + 3, 1), // _ + string_view(sentence_str.data() + 4, 4), // _bar + string_view(sentence_str.data() + 8, 0), // "" + }; + response.source.appendSentence("", sentence.begin(), sentence.end()); + response.target.appendSentence("", sentence.begin(), sentence.end()); + response.alignments = {identity_matrix(4)}; + + html.restore(response); + CHECK(response.source.text == expected_str); + CHECK(response.target.text == expected_str); +} + +TEST_CASE("Test ignored element (with entity)") { + std::string test_str("foo & bar"); + std::string expected_str("foo &bar"); + + std::string input(test_str); + HTML html(std::move(input), true); + CHECK(input == "foo bar"); + + Response response; + std::string sentence_str("foo bar"); + std::vector sentence{ + string_view(sentence_str.data() + 0, 3), // foo + string_view(sentence_str.data() + 3, 1), // _ + string_view(sentence_str.data() + 4, 4), // _bar + string_view(sentence_str.data() + 8, 0), // "" + }; + response.source.appendSentence("", sentence.begin(), sentence.end()); + response.target.appendSentence("", sentence.begin(), sentence.end()); + response.alignments = {identity_matrix(4)}; + + html.restore(response); + CHECK(response.source.text == expected_str); + CHECK(response.target.text == expected_str); +} + TEST_CASE("End-to-end translation", "[!mayfail]") { std::string input("

I like to drive this car.

"); HTML html(std::move(input), true); diff --git a/src/translator/annotation.h b/src/translator/annotation.h index 785d49dfe..5a17dfcfe 100644 --- a/src/translator/annotation.h +++ b/src/translator/annotation.h @@ -185,6 +185,41 @@ struct AnnotatedText { /// Returns a ByteRange representing sentence corresponding to sentenceIdx. ByteRange sentenceAsByteRange(size_t sentenceIdx) const { return annotation.sentence(sentenceIdx); } + /// Utility function to call `fun` on each word (subword token effectively) in + /// an `AnnotatedText`. `fun` is called with the `ByteRange`, the `string_view` + /// with the word, and a `bool` to indicate whether it is the last word in the + /// `AnnotatedText`, which is also the ending whitespace slot of AnnotatedText. + template + AnnotatedText apply(Fun fun) const { + AnnotatedText out; + + for (size_t sentenceIdx = 0; sentenceIdx < numSentences(); ++sentenceIdx) { + std::string sentence; + std::vector tokens; + + std::string prefix = fun(annotation.gap(sentenceIdx), gap(sentenceIdx), false); + + for (size_t wordIdx = 0; wordIdx < numWords(sentenceIdx); ++wordIdx) { + std::string token = fun(wordAsByteRange(sentenceIdx, wordIdx), word(sentenceIdx, wordIdx), false); + tokens.push_back(ByteRange{sentence.size(), sentence.size() + token.size()}); + sentence += token; + } + + // Convert our ByteRanges to string_views since that's what appendSentence + // expects + std::vector views(tokens.size()); + std::transform(tokens.begin(), tokens.end(), views.begin(), [&](ByteRange const &range) { + return marian::string_view(sentence.data() + range.begin, range.size()); + }); + + out.appendSentence(prefix, views.begin(), views.end()); + } + + out.appendEndingWhitespace(fun(annotation.gap(numSentences()), gap(numSentences()), true)); + + return out; + } + private: string_view asStringView(const ByteRange &byteRange) const { return string_view(text.data() + byteRange.begin, byteRange.size()); diff --git a/src/translator/html.cpp b/src/translator/html.cpp index 242572db0..ed42b9117 100644 --- a/src/translator/html.cpp +++ b/src/translator/html.cpp @@ -1,21 +1,23 @@ #include "html.h" +#include + #include "response.h" #include "xh_scanner.h" namespace { -using marian::string_view; using marian::bergamot::AnnotatedText; using marian::bergamot::ByteRange; using marian::bergamot::HTML; using marian::bergamot::Response; -void encodeEntities(string_view const &input, std::string &output) { +/// Encodes the minimum of HTML entities. +void encodeEntities(marian::string_view const &input, std::string &output) { output.clear(); output.reserve(input.size()); // assumes there are no entities in most cases - for (auto it = input.begin(); it != input.end(); ++it) { - switch (*it) { + for (char it : input) { + switch (it) { case '&': output.append("&"); break; @@ -35,19 +37,30 @@ void encodeEntities(string_view const &input, std::string &output) { // output.append("'"); // break; default: - output.push_back(*it); + output.push_back(it); break; } } } -size_t countPrefixWhitespaces(string_view const &input) { +/// Counts number of whitespace characters at the start of the input. Used +/// for determining where to insert an open or close tag. 
+size_t countPrefixWhitespaces(marian::string_view const &input) { size_t size = 0; while (size < input.size() && std::isspace(input[size])) ++size; return size; } -// Very simple replacement for std::format introduced in C++20 +std::string toLowerCase(std::string_view const &input) { + std::string out; + out.resize(input.size()); + std::transform(input.begin(), input.end(), out.begin(), [](unsigned char c) { return std::tolower(c); }); + return out; +} + +/// Very simple replacement for std::format introduced in C++20. Only supports +/// replacing `{}` in the template string with whatever `operator<<` for that +/// type turns it into. std::string format(std::string const &formatTemplate) { return formatTemplate; } template @@ -68,14 +81,14 @@ std::string format(std::string const &formatTemplate, Arg arg, Args... args) { return os.str(); } -// Syntactic sugar around rbegin() and rend() that allows me to write -// `for (auto &&item : reversed(container))` instead of the needlessly verbose -// `for (auto it = container.rbegin(); it != container.rend(); ++it)` +/// Syntactic sugar around rbegin() and rend() that allows me to write +/// `for (auto &&item : reversed(container))` instead of the needlessly verbose +/// `for (auto it = container.rbegin(); it != container.rend(); ++it)` template -class reversed { +class Reversed { public: - typedef typename T::const_reverse_iterator iterator; - explicit reversed(T const &container) : container_(container){}; + using iterator = typename T::const_reverse_iterator; + explicit Reversed(T const &container) : container_(container){}; iterator begin() const { return container_.rbegin(); } iterator end() const { return container_.rend(); } @@ -83,11 +96,10 @@ class reversed { T const &container_; }; -bool contains(std::unordered_set const &set, std::string const &name) { - return set.find(name) != set.end(); -} - -void diffTags(HTML::Taint const &prev, HTML::Taint const &curr, HTML::Taint &opening, HTML::Taint &closing) { +/// When comparing two tag stacks, determine which tags need to be closed and +/// opened to get from one stack to the other. +void diffTags(HTML::TagStack const &prev, HTML::TagStack const &curr, HTML::TagStack &opening, + HTML::TagStack &closing) { opening.clear(); closing.clear(); @@ -98,9 +110,11 @@ void diffTags(HTML::Taint const &prev, HTML::Taint const &curr, HTML::Taint &ope if (i >= curr.size() || prev[i] != curr[i]) break; // Only nodes of type ELEMENT can have children and thus would need a closing tag. + // NOLINTNEXTLINE(bugprone-narrowing-conversions) std::copy_if(prev.begin() + i, prev.end(), std::back_inserter(closing), [&](HTML::Tag *tag) { return tag->type == HTML::Tag::ELEMENT; }); + // NOLINTNEXTLINE(bugprone-narrowing-conversions) opening.insert(opening.end(), curr.begin() + i, curr.end()); } @@ -108,42 +122,24 @@ bool intersects(ByteRange const &range, HTML::Span const &span) { return range.begin <= span.end && range.end >= span.begin; }; -bool containsTag(HTML::Taint const &stack, HTML::Tag const *tag) { +bool contains(HTML::TagNameSet const &set, std::string_view const &name) { return set.find(name) != set.end(); } + +bool contains(HTML::TagStack const &stack, HTML::Tag const *tag) { return std::find(stack.rbegin(), stack.rend(), tag) != stack.rend(); } -template -AnnotatedText apply(AnnotatedText const &in, Fun fun) { - AnnotatedText out; - - for (size_t sentenceIdx = 0; sentenceIdx < in.numSentences(); ++sentenceIdx) { - std::string sentence; - std::vector tokens; +/// Is tag stack B an extended version of A? 
I.e. same tags, but maybe a few +/// more nested deeper. +bool extends(HTML::TagStack const &b, HTML::TagStack const &a) { + if (a.size() > b.size()) return false; - std::string prefix = fun(in.annotation.gap(sentenceIdx), in.gap(sentenceIdx), false); + for (auto i = a.begin(), j = b.begin(); i != a.end(); ++i, ++j) + if (*i != *j) return false; - for (size_t wordIdx = 0; wordIdx < in.numWords(sentenceIdx); ++wordIdx) { - std::string token = fun(in.wordAsByteRange(sentenceIdx, wordIdx), in.word(sentenceIdx, wordIdx), false); - tokens.push_back(ByteRange{sentence.size(), sentence.size() + token.size()}); - sentence += token; - } - - // Convert our ByteRanges to string_views since that's what appendSentence - // expects - // TODO: extend AnnotatedText::appendSentence to accept str + ByteRanges - // directly - std::vector views(tokens.size()); - std::transform(tokens.begin(), tokens.end(), views.begin(), - [&](ByteRange const &range) { return string_view(sentence.data() + range.begin, range.size()); }); - - out.appendSentence(prefix, views.begin(), views.end()); - } - - out.appendEndingWhitespace(fun(in.annotation.gap(in.numSentences()), in.gap(in.numSentences()), true)); - - return out; + return true; } +/// Tests whether `response` has alignment info associated with it or not. bool hasAlignments(Response const &response) { // Test for each sentence individually as a sentence may be empty (or there) // might be no sentences, so just testing for alignments.empty() would not be @@ -162,11 +158,12 @@ bool hasAlignments(Response const &response) { return true; } -// Little helper class to append HTML to a token +/// Helper class to append HTML tags to a token. Also makes sure the token is +/// encoded as valid HTML. class TokenFormatter { public: - explicit TokenFormatter(string_view token) - : html_(), offset_(0), whitespaceOffset_(0), whitespaceSize_(countPrefixWhitespaces(token)), closeLeft_(true) { + explicit TokenFormatter(marian::string_view token) + : offset_(0), whitespaceOffset_(0), whitespaceSize_(countPrefixWhitespaces(token)), closeLeft_(true) { // Do encoding of any entities that popped up in the translation encodeEntities(token, html_); } @@ -174,12 +171,12 @@ class TokenFormatter { std::string &&html() { return std::move(html_); } // Append the markup necessary for moving from `prev` set of tags to `curr`. - void append(HTML::Taint const &prev, HTML::Taint const &curr) { - HTML::Taint opening, closing; + void append(HTML::TagStack const &prev, HTML::TagStack const &curr) { + HTML::TagStack opening, closing; diffTags(prev, curr, opening, closing); - for (HTML::Tag const *tag : reversed(closing)) { + for (HTML::Tag const *tag : Reversed(closing)) { assert(tag->type == HTML::Tag::ELEMENT); std::string closeTag = format("", tag->name); html_.insert(offset_ + (closeLeft_ ? 0 : whitespaceSize_), closeTag); @@ -232,6 +229,8 @@ class TokenFormatter { bool closeLeft_; }; +/// Count the number of tokens in an AnnotatedText. Used to assert we're not +/// running out of sync when creating vectors that describe each token. size_t debugCountTokens(AnnotatedText const &text) { size_t tokens = 1; // for the ending gap for (size_t sentenceIdx = 0; sentenceIdx < text.numSentences(); ++sentenceIdx) { @@ -240,11 +239,87 @@ size_t debugCountTokens(AnnotatedText const &text) { return tokens; } +/// Helper function that consumes a tag as if it is a special tag, except that +/// it takes nesting into account. I.e. `` will be consumed to the +// last ``. 
Assumes TT_TAG_START is already consumed, which was necessary +/// to determine whether this was an element that needed to be ignored. +void consumeIgnoredTag(markup::Scanner &scanner, HTML::Tag &tag, std::string const &name) { + // Only full elements can be consumed this way. With void tags we don't know + // where to stop scanning. All other types cannot be nested anyway. + assert(tag.type == HTML::Tag::ELEMENT); + + // TT_TAG_START is already consumed. + markup::Scanner::TokenType token; + size_t inside = 0; + + // Consume the full open tag, i.e. all its attributes + while (!inside) { + token = scanner.next(); + switch (token) { + case markup::Scanner::TT_ERROR: + ABORT("HTML parse error"); + case markup::Scanner::TT_EOF: + ABORT("Did not find closing tag ", name); + case markup::Scanner::TT_ATTRIBUTE: + tag.attributes += format(" {}=\"{}\"", scanner.attribute(), scanner.value()); + break; + default: + // Not an attribute! Must be something inside the body or the closing + // tag already. Time to jump to the next loop. + ++inside; + break; + } + } + + // Last token was something that would have triggered Scanner::scanBody(), + // which sets value() to start pointing at the body. + const char *start = scanner.start(); + + // Consume the rest of the HTML until (including) the final closing tag. We + // start with the token that caused the previous loop to fall into the default + // case. + while (inside) { + switch (token) { + case markup::Scanner::TT_ERROR: + ABORT("HTML parse error"); + case markup::Scanner::TT_EOF: + ABORT("Did not find closing tag "); + case markup::Scanner::TT_TAG_START: + // Note: Looking specifically for only our own type of tag so we don't + // have to care about whether other tags we encounter are void tags or + // not. Does assume the HTML is valid, as no stack is kept. + if (toLowerCase(scanner.tag()) == name) ++inside; + break; + case markup::Scanner::TT_TAG_END: + if (toLowerCase(scanner.tag()) == name) --inside; + break; + default: + break; + } + + // Only continue scanning if we're still inside. We could have just read the + // TT_TAG_END token that ended this element, and we don't want to continue + // consuming tokens at that point. + if (inside) token = scanner.next(); + } + + // Only a TAG_END could have stopped the previous loop. We take the start + // of the final closing tag as the end of our data. + assert(token == markup::Scanner::TT_TAG_END); + const char *end = scanner.start(); + + // All data between the end of the first open element, and the start of the + // last close element, we just treat as raw data that will be printed when + // this tag is eventually printed. + assert(end >= start); + tag.data = std::string_view(start, end - start); +} + } // namespace namespace marian::bergamot { -// Formatters used for exception messages combined with format() +/// Formatters used for formatting error messages in ABORT() calls. 
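As an aside, the tag-stack diffing performed by diffTags() and TokenFormatter::append() boils down to a longest-common-prefix comparison. A self-contained sketch with plain strings standing in for HTML::Tag pointers (the tag names here are illustrative, not from the patch):

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> prev{"p", "b"};       // tags open for the previous token
  std::vector<std::string> curr{"p", "i", "u"};  // tags open for the current token

  // The longest shared prefix of the two stacks stays untouched.
  std::size_t shared = 0;
  while (shared < prev.size() && shared < curr.size() && prev[shared] == curr[shared]) ++shared;

  // Close what is no longer open, innermost first. (The real diffTags() only
  // emits closing tags for ELEMENT-type tags.)
  for (std::size_t i = prev.size(); i > shared; --i) std::cout << "</" << prev[i - 1] << ">";
  // Open what is newly open, outermost first.
  for (std::size_t i = shared; i < curr.size(); ++i) std::cout << "<" << curr[i] << ">";
  std::cout << "\n";  // prints: </b><i><u>
  return 0;
}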
std::ostream &operator<<(std::ostream &out, HTML::Tag const *tag) { if (tag == nullptr) return out << "[nullptr]"; switch (tag->type) { @@ -262,7 +337,7 @@ std::ostream &operator<<(std::ostream &out, HTML::Tag const *tag) { return out << "[Unknown tag type]"; } -std::ostream &operator<<(std::ostream &out, HTML::Taint const &tags) { +std::ostream &operator<<(std::ostream &out, HTML::TagStack const &tags) { for (auto it = tags.begin(); it != tags.end(); ++it) { if (it != tags.begin()) out << ' '; out << *it; @@ -270,18 +345,20 @@ std::ostream &operator<<(std::ostream &out, HTML::Taint const &tags) { return out; } -HTML::HTML(std::string &&source, bool process_markup, Options &&options) : options_(std::move(options)) { - if (!process_markup) return; +HTML::HTML(std::string &&source, bool processMarkup, Options &&options) : options_(std::move(options)) { + if (!processMarkup) return; std::string original = std::move(source); markup::instream in(original.data(), original.data() + original.size()); markup::Scanner scanner(in); source.clear(); // source is moved out of, so should be clear anyway - Tag *tag; - Taint stack; - bool addSentenceBreak = false; - bool addSpace = false; + Tag *tag = nullptr; // current tag (after opening at least) + TagStack stack; // stack of currently open tags + bool addSentenceBreak = false; // whether to add a sentence break next text segment + bool addWordBreak = false; // whether to add a word break next text segment + + // Starting point: an empty span with no open tags. spans_.push_back(Span{0, 0, {}}); bool stop = false; @@ -298,13 +375,14 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio // If the previous segment was the open or close tag of a block element // we treat the text after it as a new sentence. if (addSentenceBreak) { - if (!(source.empty() || (source.size() > 2 && source.substr(source.size() - 2) == ""))) { + // If there isn't already a \n\n at the end of source... + if (source.size() >= 2 && source.substr(source.size() - 2) != "\n\n") { stack.push_back(makeTag({Tag::WHITESPACE})); // Important: span->size() == 0 to make it behave as a void element. // Also important: position before the \n\n tokens, not after, to // make it easier to remove them later through apply(). spans_.push_back(Span{source.size(), source.size(), stack}); - source.append("\n\n"); // TODO assumes ssplit-mode = wrapped_text + source.append("\n\n"); // Should work with ssplit-mode = wrapped_text stack.pop_back(); } addSentenceBreak = false; @@ -312,24 +390,27 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio // If the previous segment was an open or close tag, it might be best // to add a space to make sure we don't append to the previous word. - if (addSpace) { - if (options_.substituteInlineTagsWithSpaces && !source.empty() && !std::isspace(source.back()) && - !std::isspace(scanner.value()[0])) { + if (addWordBreak) { + // Only add the space when it would be inside a word. Do not add it if + // it would be between a word and punctuation. + if (options_.substituteInlineTagsWithSpaces && isContinuation(source, scanner.value())) { source.push_back(' '); } - addSpace = false; + addWordBreak = false; } + // Store which tags were open when this span of text was encountered. 
auto begin = source.size(); source.append(scanner.value()); spans_.push_back(Span{begin, source.size(), stack}); } break; case markup::Scanner::TT_TAG_START: { - std::string name(scanner.tag()); + std::string name = toLowerCase(scanner.tag()); // Tag *tag is used by attribute parsing - tag = makeTag({contains(options_.voidTags, name) ? Tag::VOID_ELEMENT : Tag::ELEMENT, std::move(name)}); + auto type = contains(options_.voidTags, name) ? Tag::VOID_ELEMENT : Tag::ELEMENT; + tag = makeTag({type, std::string(scanner.tag())}); stack.push_back(tag); @@ -341,39 +422,48 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio stack.pop_back(); } + // Ignored tags have same semantics as void tags with regards to moving + // them around with the rest of the content. + if (contains(options_.ignoredTags, name)) { + consumeIgnoredTag(scanner, *tag, name); + spans_.push_back(Span{source.size(), source.size(), stack}); + stack.pop_back(); + } + // Treat non-inline HTML tags as spaces that break up words. - if (!contains(options_.inlineTags, tag->name)) { + if (!contains(options_.inlineTags, name)) { addSentenceBreak = true; - } else { - addSpace = true; + } else if (!contains(options_.inWordTags, name)) { + addWordBreak = true; } } break; - case markup::Scanner::TT_TAG_END: + case markup::Scanner::TT_TAG_END: { + std::string tagName = toLowerCase(scanner.tag()); // If this is the closing bit of a void tag, i.e. triggered by the "/>" // bit of "", then completely ignore it. - if (contains(options_.voidTags, std::string(scanner.tag()))) break; + if (contains(options_.voidTags, tagName)) break; ABORT_IF(stack.empty(), "Encountered more closing tags ({}) than opening tags", scanner.tag()); - ABORT_IF(stack.back()->name != scanner.tag(), "Encountered unexpected closing tag , stack is {}", - scanner.tag(), stack); + ABORT_IF(toLowerCase(stack.back()->name) != toLowerCase(scanner.tag()), + "Encountered unexpected closing tag , stack is {}", scanner.tag(), stack); // What to do with "" case, where tag is immediately closed // so it never makes it into the taint of any of the spans? This adds // an empty span so it still gets recorded in spans_. - if (spans_.empty() || !containsTag(spans_.back().tags, stack.back())) + if (spans_.empty() || !contains(spans_.back().tags, stack.back())) spans_.push_back(Span{source.size(), source.size(), stack}); stack.pop_back(); // Add space if necessary - if (!contains(options_.inlineTags, std::string(scanner.tag()))) { + if (!contains(options_.inlineTags, tagName)) { addSentenceBreak = true; - } else { - addSpace = true; + } else if (!contains(options_.inWordTags, tagName)) { + addWordBreak = true; } - break; + } break; case markup::Scanner::TT_ATTRIBUTE: assert(tag != nullptr); @@ -448,10 +538,10 @@ void HTML::restore(Response &response) { // Find for every token in target the token in source that best matches. 
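The "best matches" selection referred to here is a per-row argmax over the soft alignment scores. A minimal standalone version of that selection (the scores are made up; the layout mirrors response.alignments, one row per target token):

#include <algorithm>
#include <cstddef>
#include <vector>

// For each target token (row), pick the source token (column) with the highest score.
std::vector<std::size_t> hardAlign(std::vector<std::vector<float>> const &soft) {
  std::vector<std::size_t> best;
  best.reserve(soft.size());
  for (auto const &row : soft)
    best.push_back(static_cast<std::size_t>(std::max_element(row.begin(), row.end()) - row.begin()));
  return best;
}

// hardAlign({{0.1f, 0.8f, 0.1f},
//            {0.7f, 0.2f, 0.1f}}) == {1, 0}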
std::vector> alignments; - hardAlignments(response, alignments); + hardAlignments(response, alignments, sourceTokenSpans); std::vector targetTokenSpans; - copyTaint(response, alignments, sourceTokenSpans, targetTokenSpans); + copyTagStack(response, alignments, sourceTokenSpans, targetTokenSpans); assert(targetTokenSpans.size() == debugCountTokens(response.target)); AnnotatedText target = restoreTarget(response.target, targetTokenSpans); @@ -466,7 +556,7 @@ AnnotatedText HTML::restoreSource(AnnotatedText const &in, std::vectortags.empty()); - return apply(in, [&](ByteRange range, string_view token, bool last) { + return in.apply([&](ByteRange range, string_view token, bool last) { TokenFormatter formatter(token); // Potential issue: spans and tokens can intersect, e.g. @@ -475,9 +565,11 @@ AnnotatedText HTML::restoreSource(AnnotatedText const &in, std::vector, others only

) // tokens |111111111111111|2| // - // Now 1 covers span 1 to 3, so what taint should it get? Just

, or

? - // Note: only relevant if isBlockElement is used. If we just insert spaces - // around all elements, every segment of `hello` will be a token. + // Now 1 covers span 1 to 3, so what taint should it get? Just `

`, or + // `

`? + // Note: only relevant if `substituteInlineTagsWithSpaces` is true. If we + // just insert spaces around all elements, every segment of `hello` will be + // a token. // Seek to the last span that overlaps with this token while (true) { @@ -494,7 +586,7 @@ AnnotatedText HTML::restoreSource(AnnotatedText const &in, std::vector const &targetTokenSpans) { auto prevSpan = spans_.cbegin(); auto targetSpanIt = targetTokenSpans.begin(); + auto straggerSpanIt = spans_.cbegin(); - AnnotatedText out = apply(in, [&](ByteRange range, string_view token, bool last) { + AnnotatedText out = in.apply([&]([[maybe_unused]] ByteRange range, string_view token, bool last) { TokenFormatter formatter(token); // First we scan through spans_ to catch up to the span assigned to this // token. We're only interested in empty spans (empty and void elements) - for (auto span_it = prevSpan; span_it < *targetSpanIt; span_it++) { + for (; straggerSpanIt < *targetSpanIt; ++straggerSpanIt) { // We're only interested in empty spans or spans that would otherwise get // lost because they didn't align with anything between the spans in // targetSpanIt // TODO That std::find makes this O(N*N) NOT GOOD NOT GOOD - if (span_it->size() != 0 && - std::find(targetTokenSpans.begin(), targetTokenSpans.end(), span_it) != targetTokenSpans.end()) + if (straggerSpanIt->size() != 0 && + std::find(targetTokenSpans.begin(), targetTokenSpans.end(), straggerSpanIt) != targetTokenSpans.end()) continue; - formatter.append(prevSpan->tags, span_it->tags); + formatter.append(prevSpan->tags, straggerSpanIt->tags); // Note: here, not in 3rd part of for-statement because we don't want to // set prevSpan if the continue clause at the beginning of this for-loop // was hit. - prevSpan = span_it; + prevSpan = straggerSpanIt; } // Now do the same thing but for our target set of tags. Note that we cannot @@ -539,7 +632,7 @@ AnnotatedText HTML::restoreTarget(AnnotatedText const &in, std::vectortags.empty()); - formatter.append((*targetSpanIt)->tags, HTML::Taint()); + formatter.append((*targetSpanIt)->tags, HTML::TagStack()); } prevSpan = *targetSpanIt; @@ -559,8 +652,9 @@ HTML::Tag *HTML::makeTag(Tag &&tag) { return &pool_.front(); } -void HTML::copyTaint(Response const &response, std::vector> const &alignments, - std::vector const &sourceTokenSpans, std::vector &targetTokenSpans) { +void HTML::copyTagStack(Response const &response, std::vector> const &alignments, + std::vector const &sourceTokenSpans, + std::vector &targetTokenSpans) { size_t offset = 0; // Sentence offset in sourceTokenSpans // Fill targetTokenSpans based on the alignments we just made up. @@ -584,14 +678,25 @@ void HTML::copyTaint(Response const &response, std::vector> // to determine whether we should share the markup, or whether we should see // this token as a fresh start. This implementation will treat "hello[world]" // as 4 words, assuming its tokenised as something like `h ell o [ wor ld ]`. 
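The continuation check described above is small enough to show in isolation. A standalone sketch follows; the delimiter string used in the usage comments is only an example value, since continuationDelimiters is empty by default in this patch.

#include <string>
#include <string_view>

// Two adjacent tokens belong to the same word unless the first ends with, or the
// second starts with, a delimiter character. An empty delimiter set disables the check.
bool isContinuation(std::string_view prev, std::string_view str, std::string const &delimiters) {
  if (delimiters.empty()) return false;
  if (prev.empty() || str.empty()) return false;
  return delimiters.find(str[0]) == std::string::npos && delimiters.find(prev.back()) == std::string::npos;
}

// isContinuation("hel", "lo", "\n ,.(){}[]")       -> true  (pieces of one word: "hello")
// isContinuation("hello", "[world", "\n ,.(){}[]") -> false ('[' is a delimiter)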
-bool HTML::isContinuation(string_view prev, string_view str) { +bool HTML::isContinuation(std::string_view prev, std::string_view str) const { if (options_.continuationDelimiters.empty()) return false; if (prev.empty() || str.empty()) return false; return options_.continuationDelimiters.find(str[0]) == std::string::npos && options_.continuationDelimiters.find(prev.back()) == std::string::npos; } -void HTML::hardAlignments(Response const &response, std::vector> &alignments) { +bool HTML::isContinuation(marian::string_view prev, marian::string_view str) const { + return isContinuation(std::string_view(prev.data(), prev.size()), std::string_view(str.data(), str.size())); +} + +/// Selects for each token in `response.target` a best source token from +/// `response.source` and writes this selection to `alignments`. The source +/// token spans are used to also look at the markup applied to each token to +/// figure out which source token best represents each target token. +void HTML::hardAlignments(Response const &response, std::vector> &alignments, + std::vector const &sourceTokenSpans) { + size_t offset = 0; // sentence offset in sourceTokenSpans + // For each sentence... for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) { alignments.emplace_back(); @@ -600,14 +705,9 @@ void HTML::hardAlignments(Response const &response, std::vector response.alignments[sentenceIdx][t][maxS]) { - maxS = s; - } - } - - alignments.back().push_back(maxS); + alignments.back().push_back( + std::max_element(response.alignments[sentenceIdx][t].begin(), response.alignments[sentenceIdx][t].end()) - + response.alignments[sentenceIdx][t].begin()); } // Next, we try to smooth out these selected alignments with a few heuristics @@ -622,7 +722,14 @@ void HTML::hardAlignments(Response const &response, std::vector= prevScore) { + TagStack const &currTagStack = sourceTokenSpans[offset + 1 + currSentenceIdx]->tags; + TagStack const &prevTagStack = sourceTokenSpans[offset + 1 + prevSentenceIdx]->tags; + + // If this token has more markup, or a better score than the previous + // token (and they together are part of a word-ish thing) then mark + // this word as aligning. Otherwise just copy the alignment source of + // the previous token. + if (extends(currTagStack, prevTagStack) || currScore >= prevScore) { // Apply this to all previous tokens in the word for (size_t i = t;; --i) { alignments.back()[i] = currSentenceIdx; @@ -640,6 +747,8 @@ void HTML::hardAlignments(Response const &response, std::vector +#include #include #include -#include +#include #include "annotation.h" +#include "data/types.h" #include "definitions.h" -namespace marian { -namespace bergamot { +namespace marian::bergamot { struct Response; +/// HTML class parses and removes HTML from input text, and places it back into +/// the translated output text. +/// +/// When parsing the HTML, it treats tags as markup, where a list of nested tags +/// can be seen as a list of markups that are applicable to all the text that +/// follows. This list is stored as a `TagStack`. Whenever an HTML tag opens or +/// closes, a new TagStack is created to reflect that. TagStack used to be +/// called `Taint` because it *tainted* the text it was associated with with +/// those tags as markup. The text between tags themselves is stored in the +/// input variable. In `spans_`, the TagStack that is associated with a +/// substring of that text is stored. 
+/// When transferring the HTML from the source text to the translated target +/// text, the TagStacks are first associated with each of the subwords from the +/// source text. Using hard alignment, each subword in the source text is linked +/// to a subword in the target text. The TagStacks are then copied over these +/// links. Finally, the HTML is inserted back into the target text by for each +/// subword, comparing the TagStack from the previous word to that word, and +/// opening and closing elements to make up for the difference. +/// +/// There are a couple of complexities though: +/// 1. Not all tags can be treated as markup applied to text. For example, an +/// `` does not contain text itself. Or `` does not. We do want +/// those tags to remain in the output though. We do this by associating +/// them to an empty `Span`. When inserting HTML back into the translation +/// input or output, we keep track of where in the `spans_` vector we are, +/// and insert any elements from empty spans that we might have skipped over +/// because empty spans are never linked to tokens/subwords. These are +/// *stragglers* in some parts of the code, or *void* or *empty* elements in +/// other parts. +/// 2. Some tags should be treated as paragraph indicators, and break up +/// sentences. These are the usual suspects like `

`, but also `

  • ` and +/// ``, to make sure we don't translate two table cells into a single +/// word. This is the `addSentenceBreak` flag in the HTML parsing bit. +/// We mark these breaks with `\n\n` in the input text and with a special +/// WHITESPACE tag that we treat as any other void tag. Hopefully this tag +/// moves with the added `\n\n` and it is easy for us to remove it again. +/// (in practise it is since these only occur at the end of sentences and +/// the end of sentences are always aligned between source and target.) +/// 3. We treat most tags as word-breaking. We do this by adding spaces just +/// after where we saw the open or close tag occur. If there is already +/// some whitespace in that place, we do not add extra spaces. +/// 4. TODO class HTML { public: + using TagNameSet = std::set>; + + /// Options struct that controls how HTML is interpreted. struct Options { - // List of elements for which we do not expect a closing tag, or self-closing - // elements in XHTML. See also https://developer.mozilla.org/en-US/docs/Glossary/Empty_element - // More relevant source of this list: - // https://searchfox.org/mozilla-central/rev/7d17fd1fe9f0005a2fb19e5d53da4741b06a98ba/dom/base/FragmentOrElement.cpp#1791 - std::unordered_set voidTags{"area", "base", "basefont", "bgsound", "br", "col", - "embed", "frame", "hr", "img", "input", "keygen", - "link", "meta", "param", "source", "track", "wbr"}; - - std::unordered_set inlineTags{"abbr", "a", "b", "em", "i", "kbd", "mark", "math", - "output", "q", "ruby", "small", "span", "strong", "sub", "sup", - "time", "u", "var", "wbr", "ins", "del", "img"}; - - // List of characters that occur at the start of a token that indicate that - // the this token is probably *not* a continuation of a word. Set to empty - // to never mark a token as a continuation of the word. - // std::string continuationDelimiters = "\n ,.(){}[]"; - std::string continuationDelimiters; - - // Should we always add spaces to the places where tags used to be? I.e. - // `underline` should become `un der line`? + /// List of elements for which we do not expect a closing tag, or + /// self-closing elements in XHTML. We do not need to see a closing tag + /// for these elements, and they cannot contain text or tags themselves. + /// See also: + /// https://developer.mozilla.org/en-US/docs/Glossary/Empty_element. + /// More relevant source of this list: + /// https://searchfox.org/mozilla-central/rev/7d17fd1fe9f0005a2fb19e5d53da4741b06a98ba/dom/base/FragmentOrElement.cpp#1791 + TagNameSet voidTags{"area", "base", "basefont", "bgsound", "br", "col", "embed", "frame", "hr", + "img", "input", "keygen", "link", "meta", "param", "source", "track", "wbr"}; + + /// List of elements that are treated as inline, meaning they do not break + /// up sentences. Any element *not* in this list will cause the text that + /// follows its open or close tag to be treated as a separate sentence. + TagNameSet inlineTags{"abbr", "a", "b", "em", "i", "kbd", "mark", "math", + "output", "q", "ruby", "small", "span", "strong", "sub", "sup", + "time", "u", "var", "wbr", "ins", "del", "img"}; + + /// List of elements that are, regardless of `substituteInlineTagsWithSpaces`, + /// not substituted with spaces. Technically almost all inline elements + /// should be treated like this, except `
    ` maybe, but in practice it + /// seems to be more effective to limit this set to just that one tag that + /// can only really be used *inside* words: `<wbr>`. + /// See also: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/wbr + TagNameSet inWordTags{"wbr"}; + + /// List of elements we copy as is, but do parse as if they're HTML because + /// they could be nested. For because + /// the script tag may not be nested, but that is not the case for these + /// elements per se. Some tags, like
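For context, a minimal sketch of how the HTML class is meant to be driven end to end, based on how the unit tests above use it. translateStripped() is a hypothetical stand-in for whatever produces a Response for the markup-free text, and the include paths are assumptions that may differ per build setup.

#include <string>
#include <utility>

#include "translator/html.h"      // assumed include path
#include "translator/response.h"  // assumed include path

// Hypothetical helper: translates plain text and returns a Response carrying
// source/target AnnotatedText plus alignments.
marian::bergamot::Response translateStripped(std::string const &text);

std::string translateWithMarkup(std::string html) {
  // The constructor consumes the string and writes the markup-stripped text
  // back into it, which is exactly what the unit tests in this patch rely on.
  marian::bergamot::HTML tracker(std::move(html), /*processMarkup=*/true);

  marian::bergamot::Response response = translateStripped(html);

  // Re-insert the recorded tags into response.source and response.target.
  tracker.restore(response);
  return response.target.text;
}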