From 40eabc1a3393dc99e83b688178c8255f0483cadd Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Wed, 26 Jan 2022 17:33:58 +0000
Subject: [PATCH 01/29] Aggressively try to retain markup on words if it
 appears on one of its source tokens

I do need those continuation delimiters for that, even though I really don't like them since they're so character set focussed!
---
 src/translator/html.cpp | 27 ++++++++++++++++++++++++---
 src/translator/html.h   |  6 +++---
 2 files changed, 27 insertions(+), 6 deletions(-)
diff --git a/src/translator/html.cpp b/src/translator/html.cpp
index 20614f578..bcfcc15b6 100644
--- a/src/translator/html.cpp
+++ b/src/translator/html.cpp
@@ -138,6 +138,15 @@ bool containsTag(HTML::Taint const &stack, HTML::Tag const *tag) {
   return std::find(stack.rbegin(), stack.rend(), tag) != stack.rend();
 }
 
+bool isSubset(HTML::Taint const &a, HTML::Taint const &b) {
+  if (a.size() > b.size()) return false;
+
+  for (auto i = a.begin(), j = b.begin(); i != a.end(); ++i, ++j)
+    if (*i != *j) return false;
+
+  return true;
+}
+
 template <typename Fun>
 AnnotatedText apply(AnnotatedText const &in, Fun fun) {
   AnnotatedText out;
@@ -448,7 +457,7 @@ void HTML::restore(Response &response) {
 
   // Find for every token in target the token in source that best matches.
   std::vector<std::vector<size_t>> alignments;
-  hardAlignments(response, alignments);
+  hardAlignments(response, alignments, sourceTokenSpans);
 
   std::vector<SpanIterator> targetTokenSpans;
   copyTaint(response, alignments, sourceTokenSpans, targetTokenSpans);
@@ -591,7 +600,10 @@ bool HTML::isContinuation(string_view prev, string_view str) {
          options_.continuationDelimiters.find(prev.back()) == std::string::npos;
 }
 
-void HTML::hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments) {
+void HTML::hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments,
+                          std::vector<SpanIterator> const &sourceTokenSpans) {
+  size_t offset = 0;
+
   // For each sentence...
   for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
     alignments.emplace_back();
@@ -622,7 +634,14 @@ void HTML::hardAlignments(Response const &response, std::vector<std::vector<size
         float currScore = response.alignments[sentenceIdx][t][currSentenceIdx];
         float prevScore = response.alignments[sentenceIdx][t - 1][prevSentenceIdx];
 
-        if (currScore >= prevScore) {
+        Taint const &currTaint = sourceTokenSpans[offset + 1 + currSentenceIdx]->tags;
+        Taint const &prevTaint = sourceTokenSpans[offset + 1 + prevSentenceIdx]->tags;
+
+        // If this token has more markup, or a better score than the previous
+        // token (and they together are part of a word-ish thing) then mark
+        // this word as aligning. Otherwise just copy the alignment source of
+        // the previous token.
+        if (isSubset(prevTaint, currTaint) || currScore >= prevScore) {
           // Apply this to all previous tokens in the word
           for (size_t i = t;; --i) {
             alignments.back()[i] = currSentenceIdx;
@@ -640,6 +659,8 @@ void HTML::hardAlignments(Response const &response, std::vector<std::vector<size
 
     // Always align target end with source end
     alignments.back().push_back(response.source.numWords(sentenceIdx) - 1);
+
+    offset += response.source.numWords(sentenceIdx) + 1;  // +1 for prefix gap
   }
 }
 
diff --git a/src/translator/html.h b/src/translator/html.h
index d4cbd40d5..9939c51d6 100644
--- a/src/translator/html.h
+++ b/src/translator/html.h
@@ -37,8 +37,7 @@ class HTML {
     // List of characters that occur at the start of a token that indicate that
     // the this token is probably *not* a continuation of a word. Set to empty
     // to never mark a token as a continuation of the word.
-    // std::string continuationDelimiters = "\n ,.(){}[]";
-    std::string continuationDelimiters;
+    std::string continuationDelimiters = "\n ,.(){}[]";
 
     // Should we always add spaces to the places where tags used to be? I.e.
     // `un<u>der</u>line` should become `un der line`?
@@ -86,7 +85,8 @@ class HTML {
   void copyTaint(Response const &response, std::vector<std::vector<size_t>> const &alignments,
                  std::vector<HTML::SpanIterator> const &sourceTokenSpans,
                  std::vector<HTML::SpanIterator> &targetTokenSpans);
-  void hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments);
+  void hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments,
+                      std::vector<HTML::SpanIterator> const &sourceTokenSpans);
   bool isContinuation(string_view prev, string_view str);
   // Allocates tag in pool_ (which then owns it) and gives a pointer to be used
   // in Taints. Pointer is valid as long as this HTML instance lives on.

From 723e725bbb6fbbaae703a887d757f0705d9dab5f Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Wed, 26 Jan 2022 21:06:32 +0000
Subject: [PATCH 02/29] Outdated todo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

🎉
---
 src/translator/response_options.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/translator/response_options.h b/src/translator/response_options.h
index b5867d00d..8ccfab856 100644
--- a/src/translator/response_options.h
+++ b/src/translator/response_options.h
@@ -19,7 +19,7 @@ struct ResponseOptions {
   bool qualityScores{false};  ///< Include quality-scores or not.
   bool alignment{false};      ///< Include alignments or not.
 
-  bool HTML{false};  /// Remove HTML tags from text and (TODO) insert in output.
+  bool HTML{false};  /// Remove HTML tags from text and insert in output.
 
   /// Whether to include sentenceMappings or not. Alignments require
   /// sentenceMappings and are available irrespective of this option if

From 9600c70b1725afabd07154355142b1b4ffebaf47 Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Tue, 8 Feb 2022 13:00:53 +0000
Subject: [PATCH 03/29] Be explicit about where the two different string_view
 types are used

---
 src/translator/html.cpp | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/translator/html.cpp b/src/translator/html.cpp
index bcfcc15b6..8e8090a5a 100644
--- a/src/translator/html.cpp
+++ b/src/translator/html.cpp
@@ -4,17 +4,16 @@
 #include "xh_scanner.h"
 
 namespace {
-using marian::string_view;
 using marian::bergamot::AnnotatedText;
 using marian::bergamot::ByteRange;
 using marian::bergamot::HTML;
 using marian::bergamot::Response;
 
-void encodeEntities(string_view const &input, std::string &output) {
+void encodeEntities(marian::string_view const &input, std::string &output) {
   output.clear();
   output.reserve(input.size());  // assumes there are no entities in most cases
 
-  for (auto it = input.begin(); it != input.end(); ++it) {
+  for (const char *it = input.begin(); it != input.end(); ++it) {
     switch (*it) {
       case '&':
         output.append("&amp;");
@@ -41,7 +40,7 @@ void encodeEntities(string_view const &input, std::string &output) {
   }
 }
 
-size_t countPrefixWhitespaces(string_view const &input) {
+size_t countPrefixWhitespaces(marian::string_view const &input) {
   size_t size = 0;
   while (size < input.size() && std::isspace(input[size])) ++size;
   return size;
@@ -98,10 +97,10 @@ std::string format(std::string const &formatTemplate, Arg arg, Args... args) {
 // `for (auto &&item : reversed(container))` instead of the needlessly verbose
 // `for (auto it = container.rbegin(); it != container.rend(); ++it)`
 template <typename T>
-class reversed {
+class Reversed {
  public:
-  typedef typename T::const_reverse_iterator iterator;
-  explicit reversed(T const &container) : container_(container){};
+  using iterator = typename T::const_reverse_iterator;
+  explicit Reversed(T const &container) : container_(container){};
   iterator begin() const { return container_.rbegin(); }
   iterator end() const { return container_.rend(); }
 
@@ -167,9 +166,10 @@ AnnotatedText apply(AnnotatedText const &in, Fun fun) {
     // expects
     // TODO: extend AnnotatedText::appendSentence to accept str + ByteRanges
     // directly
-    std::vector<string_view> views(tokens.size());
-    std::transform(tokens.begin(), tokens.end(), views.begin(),
-                   [&](ByteRange const &range) { return string_view(sentence.data() + range.begin, range.size()); });
+    std::vector<marian::string_view> views(tokens.size());
+    std::transform(tokens.begin(), tokens.end(), views.begin(), [&](ByteRange const &range) {
+      return marian::string_view(sentence.data() + range.begin, range.size());
+    });
 
     out.appendSentence(prefix, views.begin(), views.end());
   }
@@ -200,8 +200,8 @@ bool hasAlignments(Response const &response) {
 // Little helper class to append HTML to a token
 class TokenFormatter {
  public:
-  explicit TokenFormatter(string_view token)
-      : html_(), offset_(0), whitespaceOffset_(0), whitespaceSize_(countPrefixWhitespaces(token)), closeLeft_(true) {
+  explicit TokenFormatter(marian::string_view token)
+      : offset_(0), whitespaceOffset_(0), whitespaceSize_(countPrefixWhitespaces(token)), closeLeft_(true) {
     // Do encoding of any entities that popped up in the translation
     encodeEntities(token, html_);
   }
@@ -214,7 +214,7 @@ class TokenFormatter {
 
     diffTags(prev, curr, opening, closing);
 
-    for (HTML::Tag const *tag : reversed(closing)) {
+    for (HTML::Tag const *tag : Reversed(closing)) {
       assert(tag->type == HTML::Tag::ELEMENT);
       std::string closeTag = format("</{}>", tag->name);
       html_.insert(offset_ + (closeLeft_ ? 0 : whitespaceSize_), closeTag);

From 3d6673cb5ee3a307bd78b45636567dcd6b0a9069 Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Tue, 8 Feb 2022 13:03:34 +0000
Subject: [PATCH 04/29] Make HTML tags case insensitive

Tag case is retained in the output though. Well, for the opening tag at least. Closing tag always matches opening tag.
---
 src/tests/units/html_tests.cpp | 10 ++++++++++
 src/translator/html.cpp        | 27 +++++++++++++++++++--------
 2 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/src/tests/units/html_tests.cpp b/src/tests/units/html_tests.cpp
index 8b3ac0f24..4ca48ecf7 100644
--- a/src/tests/units/html_tests.cpp
+++ b/src/tests/units/html_tests.cpp
@@ -165,6 +165,16 @@ TEST_CASE("Do not abort if the input is just empty element") {
   CHECK(response.target.text == "<p></p>");
 }
 
+TEST_CASE("Tag names are case insensitive") {
+  // Tests <P> vs </p> and <BR> should be recognized as a void tag <br>.
+  // <B> should be recognized as inline.
+  std::string test_str("<P><B>Spa</B>ce<BR>please?</p>");
+
+  std::string input(test_str);
+  HTML html(std::move(input), true);
+  CHECK(input == "Spa ce\n\nplease?");
+}
+
 TEST_CASE("Test case html entities") {
   // These are all entities I would expect in innerHTML, since all other entities
   // can be encoded as UTF-8 so there's no need to encode them through &...; when
diff --git a/src/translator/html.cpp b/src/translator/html.cpp
index 8e8090a5a..cb8740ec1 100644
--- a/src/translator/html.cpp
+++ b/src/translator/html.cpp
@@ -1,5 +1,7 @@
 #include "html.h"
 
+#include <algorithm>
+
 #include "response.h"
 #include "xh_scanner.h"
 
@@ -46,6 +48,13 @@ size_t countPrefixWhitespaces(marian::string_view const &input) {
   return size;
 }
 
+std::string toLowerCase(std::string_view const &input) {
+  std::string out;
+  out.resize(input.size());
+  std::transform(input.begin(), input.end(), out.begin(), [](unsigned char c) { return std::tolower(c); });
+  return out;
+}
+
 // Formatters used for exception messages combined with format()
 std::ostream &operator<<(std::ostream &out, HTML::Tag const *tag) {
   if (tag == nullptr) return out << "[nullptr]";
@@ -335,10 +344,11 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio
       } break;
 
       case markup::Scanner::TT_TAG_START: {
-        std::string name(scanner.tag());
+        std::string name = toLowerCase(scanner.tag());
 
         // Tag *tag is used by attribute parsing
-        tag = makeTag({contains(options_.voidTags, name) ? Tag::VOID_ELEMENT : Tag::ELEMENT, std::move(name)});
+        tag =
+            makeTag({contains(options_.voidTags, name) ? Tag::VOID_ELEMENT : Tag::ELEMENT, std::string(scanner.tag())});
 
         stack.push_back(tag);
 
@@ -351,21 +361,22 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio
         }
 
         // Treat non-inline HTML tags as spaces that break up words.
-        if (!contains(options_.inlineTags, tag->name)) {
+        if (!contains(options_.inlineTags, name)) {
           addSentenceBreak = true;
         } else {
           addSpace = true;
         }
       } break;
 
-      case markup::Scanner::TT_TAG_END:
+      case markup::Scanner::TT_TAG_END: {
+        std::string tagName = toLowerCase(scanner.tag());
         // If this is the closing bit of a void tag, i.e. triggered by the "/>"
         // bit of "<img/>", then completely ignore it.
-        if (contains(options_.voidTags, std::string(scanner.tag()))) break;
+        if (contains(options_.voidTags, tagName)) break;
 
         if (stack.empty()) throw BadHTML(format("Encountered more closing tags ({}) than opening tags", scanner.tag()));
 
-        if (stack.back()->name != scanner.tag())
+        if (toLowerCase(stack.back()->name) != toLowerCase(scanner.tag()))
           throw BadHTML(format("Encountered unexpected closing tag </{}>, stack is {}", scanner.tag(), stack));
 
         // What to do with "<u></u>" case, where tag is immediately closed
@@ -377,12 +388,12 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio
         stack.pop_back();
 
         // Add space if necessary
-        if (!contains(options_.inlineTags, std::string(scanner.tag()))) {
+        if (!contains(options_.inlineTags, tagName)) {
           addSentenceBreak = true;
         } else {
           addSpace = true;
         }
-        break;
+      } break;
 
       case markup::Scanner::TT_ATTRIBUTE:
         assert(tag != nullptr);

From 5634c40a53240a1debceec8dd6552f7afa13ade3 Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Tue, 8 Feb 2022 13:04:44 +0000
Subject: [PATCH 05/29] Treat <wbr> special

Fixes #339
---
 src/tests/units/html_tests.cpp | 16 ++++++++++++++++
 src/translator/html.cpp        |  4 ++--
 src/translator/html.h          |  3 +++
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/src/tests/units/html_tests.cpp b/src/tests/units/html_tests.cpp
index 4ca48ecf7..e50a60bbf 100644
--- a/src/tests/units/html_tests.cpp
+++ b/src/tests/units/html_tests.cpp
@@ -621,6 +621,22 @@ TEST_CASE("Test comment") {
   CHECK(response.target.text == test_str);
 }
 
+TEST_CASE("Test <wbr> element") {
+  std::string test_str("hel<wbr>lo");
+
+  std::string input(test_str);
+  HTML html(std::move(input), true);
+  CHECK(input == "hello");
+}
+
+TEST_CASE("Test <wbr> element (case-insensitive)") {
+  std::string test_str("hel<WBR>lo");
+
+  std::string input(test_str);
+  HTML html(std::move(input), true);
+  CHECK(input == "hello");
+}
+
 TEST_CASE("End-to-end translation", "[!mayfail]") {
   std::string input("<p>I <b>like</b> to <u>drive</u> this car.</p>");
   HTML html(std::move(input), true);
diff --git a/src/translator/html.cpp b/src/translator/html.cpp
index cb8740ec1..520c98fbf 100644
--- a/src/translator/html.cpp
+++ b/src/translator/html.cpp
@@ -363,7 +363,7 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio
         // Treat non-inline HTML tags as spaces that break up words.
         if (!contains(options_.inlineTags, name)) {
           addSentenceBreak = true;
-        } else {
+        } else if (!contains(options_.inWordTags, name)) {
           addSpace = true;
         }
       } break;
@@ -390,7 +390,7 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio
         // Add space if necessary
         if (!contains(options_.inlineTags, tagName)) {
           addSentenceBreak = true;
-        } else {
+        } else if (!contains(options_.inWordTags, tagName)) {
           addSpace = true;
         }
       } break;
diff --git a/src/translator/html.h b/src/translator/html.h
index 9939c51d6..8f30eb6d6 100644
--- a/src/translator/html.h
+++ b/src/translator/html.h
@@ -34,6 +34,9 @@ class HTML {
                                                "output", "q", "ruby", "small", "span", "strong", "sub",  "sup",
                                                "time",   "u", "var",  "wbr",   "ins",  "del",    "img"};
 
+    // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/wbr
+    std::unordered_set<std::string> inWordTags{"wbr"};
+
     // List of characters that occur at the start of a token that indicate that
     // the this token is probably *not* a continuation of a word. Set to empty
     // to never mark a token as a continuation of the word.

From e516dbdaba0522e496cfc80ecf15a2b79a2513a2 Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Tue, 8 Feb 2022 16:42:32 +0000
Subject: [PATCH 06/29] Add support for ignoring tags

Fixes #313
---
 src/tests/units/html_tests.cpp |  8 ++++
 src/translator/html.cpp        | 83 ++++++++++++++++++++++++++++++++++
 src/translator/html.h          |  6 +++
 3 files changed, 97 insertions(+)

diff --git a/src/tests/units/html_tests.cpp b/src/tests/units/html_tests.cpp
index e50a60bbf..ae6493c30 100644
--- a/src/tests/units/html_tests.cpp
+++ b/src/tests/units/html_tests.cpp
@@ -637,6 +637,14 @@ TEST_CASE("Test <wbr> element (case-insensitive)") {
   CHECK(input == "hello");
 }
 
+TEST_CASE("Test ignored element") {
+  std::string test_str("hello <var>this is <var>nested</var> var</var> world");
+
+  std::string input(test_str);
+  HTML html(std::move(input), true);
+  CHECK(input == "hello  world");
+}
+
 TEST_CASE("End-to-end translation", "[!mayfail]") {
   std::string input("<p>I <b>like</b> to <u>drive</u> this car.</p>");
   HTML html(std::move(input), true);
diff --git a/src/translator/html.cpp b/src/translator/html.cpp
index 520c98fbf..279b68ef1 100644
--- a/src/translator/html.cpp
+++ b/src/translator/html.cpp
@@ -7,6 +7,7 @@
 
 namespace {
 using marian::bergamot::AnnotatedText;
+using marian::bergamot::BadHTML;
 using marian::bergamot::ByteRange;
 using marian::bergamot::HTML;
 using marian::bergamot::Response;
@@ -284,6 +285,80 @@ size_t debugCountTokens(AnnotatedText const &text) {
   return tokens;
 }
 
+// Helper function that consumes a tag as if it is a special tag, except that it
+// takes nesting into account. I.e. `<a><a></a></a>` will be consumed to the
+// last `</a>`. Assumes TT_TAG_START is already consumed, which was necessary
+// to determine whether this was an element that needed to be ignored.
+void consumeIgnoredTag(markup::Scanner &scanner, HTML::Tag &tag, std::string const &name) {
+  // Only full elements can be consumed this way. With void tags we don't know
+  // where to stop scanning. All other types cannot be nested anyway.
+  assert(tag.type == HTML::Tag::ELEMENT);
+
+  // TT_TAG_START is already consumed.
+  markup::Scanner::TokenType token;
+  size_t inside = 0;
+
+  // Consume the full open tag, i.e. all its attributes
+  while (!inside) {
+    token = scanner.next();
+    switch (token) {
+      case markup::Scanner::TT_ERROR:
+        throw BadHTML("HTML parse error");
+      case markup::Scanner::TT_EOF:
+        throw BadHTML(format("Did not find closing tag </{}>", name));
+      case markup::Scanner::TT_ATTRIBUTE:
+        tag.attributes += format(" {}=\"{}\"", scanner.attribute(), scanner.value());
+        break;
+      default:
+        // Not an attribute! Must be something inside the body or the closing
+        // tag already. Time to jump to the next loop.
+        ++inside;
+        break;
+    }
+  }
+
+  // Last token was something that would have triggered Scanner::scanBody(),
+  // which sets value() to start pointing at the body.
+  const char *start = scanner.value().data();
+
+  while (inside) {
+    switch (token) {
+      case markup::Scanner::TT_ERROR:
+        throw BadHTML("HTML parse error");
+      case markup::Scanner::TT_EOF:
+        throw BadHTML(format("Did not find closing tag </{}>", name));
+      case markup::Scanner::TT_TAG_START:
+      case markup::Scanner::TT_TAG_END:
+        // Note: Looking specifically for only our own type of tag so we don't
+        // have to care about whether other tags we encounter are void tags or
+        // not. Does assume the HTML is valid, as no stack is kept.
+        if (toLowerCase(scanner.tag()) == name) {
+          if (token == markup::Scanner::TT_TAG_END) {
+            if (--inside == 0) break;  // also stops loop because !inside
+          } else {
+            ++inside;
+          }
+        }
+        // intentional fall-through to scanner.next()!
+      default:
+        token = scanner.next();
+        break;
+    }
+  }
+
+  // Only a TAG_END could have stopped the previous loop. If we know the name
+  // of that closing element, e.g. `</code>`, we also know the position of
+  // where the body ended. 2 (`</`) characters before it!
+  assert(token == markup::Scanner::TT_TAG_END);
+  const char *end = scanner.tag().data() - 2;
+
+  // All data between the end of the first open element, and the start of the
+  // last close element, we just treat as raw data that will be printed when
+  // this tag is eventually printed.
+  assert(end >= start);
+  tag.data = std::string_view(start, end - start);
+}
+
 }  // namespace
 
 namespace marian::bergamot {
@@ -360,6 +435,14 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio
           stack.pop_back();
         }
 
+        // Ignored tags have same semantics as void tags with regards to moving
+        // them around with the rest of the content.
+        if (contains(options_.ignoredTags, name)) {
+          consumeIgnoredTag(scanner, *tag, name);
+          spans_.push_back(Span{source.size(), source.size(), stack});
+          stack.pop_back();
+        }
+
         // Treat non-inline HTML tags as spaces that break up words.
         if (!contains(options_.inlineTags, name)) {
           addSentenceBreak = true;
diff --git a/src/translator/html.h b/src/translator/html.h
index 8f30eb6d6..467aafa9a 100644
--- a/src/translator/html.h
+++ b/src/translator/html.h
@@ -37,6 +37,12 @@ class HTML {
     // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/wbr
     std::unordered_set<std::string> inWordTags{"wbr"};
 
+    // List of elements we copy as is, but do parse as if they're HTML because
+    // they could be nested. For <script> we just scan for </script> because
+    // the script tag may not be nested, but that is not the case for these
+    // elements per se.
+    std::unordered_set<std::string> ignoredTags{"code", "kbd", "samp", "var", "dir", "acronym", "math", "textarea"};
+
     // List of characters that occur at the start of a token that indicate that
     // the this token is probably *not* a continuation of a word. Set to empty
     // to never mark a token as a continuation of the word.

From 46159ba8180c041baf725e38b7bdfb8aea0d5ac2 Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Wed, 9 Feb 2022 11:09:57 +0000
Subject: [PATCH 07/29] Add test for regression in ignored element code path

std::bad_alloc :( Also expand tests to make sure we're recording the full ignored tag contents.
---
 src/tests/units/html_tests.cpp | 48 +++++++++++++++++++++++++++++++---
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/src/tests/units/html_tests.cpp b/src/tests/units/html_tests.cpp
index ae6493c30..0329ef436 100644
--- a/src/tests/units/html_tests.cpp
+++ b/src/tests/units/html_tests.cpp
@@ -637,12 +637,54 @@ TEST_CASE("Test <wbr> element (case-insensitive)") {
   CHECK(input == "hello");
 }
 
-TEST_CASE("Test ignored element") {
-  std::string test_str("hello <var>this is <var>nested</var> var</var> world");
+TEST_CASE("Test ignored element (nested)") {
+  std::string test_str("foo <var><var>nested</var></var> bar");
+  std::string expected_str("foo  <var><var>nested</var></var>bar");
 
   std::string input(test_str);
   HTML html(std::move(input), true);
-  CHECK(input == "hello  world");
+  CHECK(input == "foo  bar");
+
+  Response response;
+  std::string sentence_str("foo  bar");
+  std::vector<string_view> sentence{
+      string_view(sentence_str.data() + 0, 3),  // foo
+      string_view(sentence_str.data() + 3, 1),  // _
+      string_view(sentence_str.data() + 4, 4),  // _bar
+      string_view(sentence_str.data() + 8, 0),  // ""
+  };
+  response.source.appendSentence("", sentence.begin(), sentence.end());
+  response.target.appendSentence("", sentence.begin(), sentence.end());
+  response.alignments = {identity_matrix<float>(4)};
+
+  html.restore(response);
+  CHECK(response.source.text == expected_str);
+  CHECK(response.target.text == expected_str);
+}
+
+TEST_CASE("Test ignored element (with entity)") {
+  std::string test_str("foo <var>&amp;</var> bar");
+  std::string expected_str("foo  <var>&amp;</var>bar");
+
+  std::string input(test_str);
+  HTML html(std::move(input), true);
+  CHECK(input == "foo  bar");
+
+  Response response;
+  std::string sentence_str("foo  bar");
+  std::vector<string_view> sentence{
+      string_view(sentence_str.data() + 0, 3),  // foo
+      string_view(sentence_str.data() + 3, 1),  // _
+      string_view(sentence_str.data() + 4, 4),  // _bar
+      string_view(sentence_str.data() + 8, 0),  // ""
+  };
+  response.source.appendSentence("", sentence.begin(), sentence.end());
+  response.target.appendSentence("", sentence.begin(), sentence.end());
+  response.alignments = {identity_matrix<float>(4)};
+
+  html.restore(response);
+  CHECK(response.source.text == expected_str);
+  CHECK(response.target.text == expected_str);
 }
 
 TEST_CASE("End-to-end translation", "[!mayfail]") {

From af39c75d1a44579fd025605467a28ab46c88bc25 Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Wed, 9 Feb 2022 12:38:50 +0000
Subject: [PATCH 08/29] Fix bad_alloc in consumeIgnoredTag

Trouble was that `Scanner::scanEntity()` returns a value() that does not point inside the HTML input stream (but to a *decoded* entity instead). So we need another API, `Scanner::start()`, to figure out where a token starts in HTML.
---
 src/translator/html.cpp       | 10 +++++-----
 src/translator/xh_scanner.cpp | 10 ++++++++++
 src/translator/xh_scanner.h   |  6 ++++++
 3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/src/translator/html.cpp b/src/translator/html.cpp
index 279b68ef1..2af2030ee 100644
--- a/src/translator/html.cpp
+++ b/src/translator/html.cpp
@@ -319,8 +319,9 @@ void consumeIgnoredTag(markup::Scanner &scanner, HTML::Tag &tag, std::string con
 
   // Last token was something that would have triggered Scanner::scanBody(),
   // which sets value() to start pointing at the body.
-  const char *start = scanner.value().data();
+  const char *start = scanner.start();
 
+  // Consume the rest of the HTML until (including) the final closing tag.
   while (inside) {
     switch (token) {
       case markup::Scanner::TT_ERROR:
@@ -346,11 +347,10 @@ void consumeIgnoredTag(markup::Scanner &scanner, HTML::Tag &tag, std::string con
     }
   }
 
-  // Only a TAG_END could have stopped the previous loop. If we know the name
-  // of that closing element, e.g. `</code>`, we also know the position of
-  // where the body ended. 2 (`</`) characters before it!
+  // Only a TAG_END could have stopped the previous loop. We take the start
+  // of the final closing tag as the end of our data.
   assert(token == markup::Scanner::TT_TAG_END);
-  const char *end = scanner.tag().data() - 2;
+  const char *end = scanner.start();
 
   // All data between the end of the first open element, and the start of the
   // last close element, we just treat as raw data that will be printed when
diff --git a/src/translator/xh_scanner.cpp b/src/translator/xh_scanner.cpp
index 85eb7e972..050f0acf9 100644
--- a/src/translator/xh_scanner.cpp
+++ b/src/translator/xh_scanner.cpp
@@ -52,6 +52,8 @@ std::string_view Scanner::tag() const { return std::string_view(tagName_.data, t
 Scanner::TokenType Scanner::scanBody() {
   value_ = string_ref{input_.pos(), 0};
 
+  start_ = input_.pos();
+
   switch (input_.peek()) {
     case '\0':
       return TT_EOF;
@@ -198,6 +200,7 @@ Scanner::TokenType Scanner::scanAttribute() {
 // - TT_ENTITY_START
 // - TT_ERROR if unexpected character or end
 Scanner::TokenType Scanner::scanTag() {
+  start_ = input_.pos();
   if (input_.consume() != '<') return TT_ERROR;
 
   bool is_tail = input_.peek() == '/';
@@ -234,6 +237,7 @@ Scanner::TokenType Scanner::scanTag() {
 
 Scanner::TokenType Scanner::scanEntity(TokenType parentTokenType) {
   // `entity` includes starting '&' and ending ';'
+  start_ = input_.pos();
   string_ref entity{input_.pos(), 0};
   bool hasEnd = false;
 
@@ -312,11 +316,13 @@ bool Scanner::isWhitespace(char c) {
 
 Scanner::TokenType Scanner::scanComment() {
   if (gotTail_) {
+    start_ = input_.pos() - 3;  // minus "-->"
     scanFun_ = &Scanner::scanBody;
     gotTail_ = false;
     return TT_COMMENT_END;
   }
 
+  start_ = input_.pos();
   value_ = string_ref{input_.pos(), 0};
 
   while (true) {
@@ -334,11 +340,13 @@ Scanner::TokenType Scanner::scanComment() {
 
 Scanner::TokenType Scanner::scanProcessingInstruction() {
   if (gotTail_) {
+    start_ = input_.pos() - 2;
     scanFun_ = &Scanner::scanBody;
     gotTail_ = false;
     return TT_PROCESSING_INSTRUCTION_END;
   }
 
+  start_ = input_.pos();
   value_ = string_ref{input_.pos(), 0};
 
   while (true) {
@@ -356,11 +364,13 @@ Scanner::TokenType Scanner::scanProcessingInstruction() {
 
 Scanner::TokenType Scanner::scanSpecial() {
   if (gotTail_) {
+    start_ = input_.pos() - (tagName_.size + 3);
     scanFun_ = &Scanner::scanBody;
     gotTail_ = false;
     return TT_TAG_END;
   }
 
+  start_ = input_.pos();
   value_ = string_ref{input_.pos(), 0};
 
   while (true) {
diff --git a/src/translator/xh_scanner.h b/src/translator/xh_scanner.h
index 14d755bbd..530df675d 100644
--- a/src/translator/xh_scanner.h
+++ b/src/translator/xh_scanner.h
@@ -83,6 +83,7 @@ class Scanner {
         tagName_{nullptr, 0},
         attributeName_{nullptr, 0},
         input_(is),
+        start_(nullptr),
         scanFun_(&Scanner::scanBody),
         gotTail_(false) {}
 
@@ -98,6 +99,8 @@ class Scanner {
   // get tag name
   std::string_view tag() const;
 
+  inline const char *start() const { return start_; }
+
  private: /* methods */
   typedef TokenType (Scanner::*ScanPtr)();
 
@@ -137,6 +140,9 @@ class Scanner {
 
   instream &input_;
 
+  // Start position of a token.
+  const char *start_;
+
   bool gotTail_;  // aux flag used in scanComment, scanSpecial, scanProcessingInstruction
 };
 }  // namespace markup

From f595c5189053232672811d16572659b63062edea Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Fri, 11 Feb 2022 12:11:41 +0000
Subject: [PATCH 09/29] Prevent straggler void elements from showing up twice

When a word near the end of a translated sentence aligns with one at the beginning, it pushes prevIt back to the beginning. Then the next translated token will insert all straggler void elements between prevIt and it. Instead of using prevIt to track where we were with inserting stragglers, we keep our own iterator that never moves backwards.
---
 src/translator/html.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/translator/html.cpp b/src/translator/html.cpp
index 2af2030ee..6f0dca942 100644
--- a/src/translator/html.cpp
+++ b/src/translator/html.cpp
@@ -606,27 +606,28 @@ AnnotatedText HTML::restoreSource(AnnotatedText const &in, std::vector<SpanItera
 AnnotatedText HTML::restoreTarget(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans) {
   auto prevSpan = spans_.cbegin();
   auto targetSpanIt = targetTokenSpans.begin();
+  auto straggerSpanIt = spans_.cbegin();
 
   AnnotatedText out = apply(in, [&](ByteRange range, string_view token, bool last) {
     TokenFormatter formatter(token);
 
     // First we scan through spans_ to catch up to the span assigned to this
     // token. We're only interested in empty spans (empty and void elements)
-    for (auto span_it = prevSpan; span_it < *targetSpanIt; span_it++) {
+    for (; straggerSpanIt < *targetSpanIt; ++straggerSpanIt) {
       // We're only interested in empty spans or spans that would otherwise get
       // lost because they didn't align with anything between the spans in
       // targetSpanIt
       // TODO That std::find makes this O(N*N) NOT GOOD NOT GOOD
-      if (span_it->size() != 0 &&
-          std::find(targetTokenSpans.begin(), targetTokenSpans.end(), span_it) != targetTokenSpans.end())
+      if (straggerSpanIt->size() != 0 &&
+          std::find(targetTokenSpans.begin(), targetTokenSpans.end(), straggerSpanIt) != targetTokenSpans.end())
         continue;
 
-      formatter.append(prevSpan->tags, span_it->tags);
+      formatter.append(prevSpan->tags, straggerSpanIt->tags);
 
       // Note: here, not in 3rd part of for-statement because we don't want to
       // set prevSpan if the continue clause at the beginning of this for-loop
       // was hit.
-      prevSpan = span_it;
+      prevSpan = straggerSpanIt;
     }
 
     // Now do the same thing but for our target set of tags. Note that we cannot

From 32f403ab36e3a9e23ab59d8f835773d5d52636f4 Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Fri, 11 Feb 2022 23:09:03 +0000
Subject: [PATCH 10/29] Use isContinuation function to check whether we need to
 insert a space after a tag

Main reason for using this instead of `std::isspace` is to prevent a space being inserted between the tag and the full stop in `This is a <b>test</b>.`. Because that has been bothering me a lot.
---
 src/translator/html.cpp | 9 ++++++---
 src/translator/html.h   | 5 ++++-
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/translator/html.cpp b/src/translator/html.cpp
index 6f0dca942..7d662345f 100644
--- a/src/translator/html.cpp
+++ b/src/translator/html.cpp
@@ -406,8 +406,7 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio
         // If the previous segment was an open or close tag, it might be best
         // to add a space to make sure we don't append to the previous word.
         if (addSpace) {
-          if (options_.substituteInlineTagsWithSpaces && !source.empty() && !std::isspace(source.back()) &&
-              !std::isspace(scanner.value()[0])) {
+          if (options_.substituteInlineTagsWithSpaces && isContinuation(source, scanner.value())) {
             source.push_back(' ');
           }
           addSpace = false;
@@ -688,13 +687,17 @@ void HTML::copyTaint(Response const &response, std::vector<std::vector<size_t>>
 // to determine whether we should share the markup, or whether we should see
 // this token as a fresh start. This implementation will treat "hello[world]"
 // as 4 words, assuming its tokenised as something like `h ell o [ wor ld ]`.
-bool HTML::isContinuation(string_view prev, string_view str) {
+bool HTML::isContinuation(std::string_view prev, std::string_view str) {
   if (options_.continuationDelimiters.empty()) return false;
   if (prev.empty() || str.empty()) return false;
   return options_.continuationDelimiters.find(str[0]) == std::string::npos &&
          options_.continuationDelimiters.find(prev.back()) == std::string::npos;
 }
 
+bool HTML::isContinuation(marian::string_view prev, marian::string_view str) {
+  return isContinuation(std::string_view(prev.data(), prev.size()), std::string_view(str.data(), str.size()));
+}
+
 void HTML::hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments,
                           std::vector<SpanIterator> const &sourceTokenSpans) {
   size_t offset = 0;
diff --git a/src/translator/html.h b/src/translator/html.h
index 467aafa9a..0bbaf05cd 100644
--- a/src/translator/html.h
+++ b/src/translator/html.h
@@ -4,9 +4,11 @@
 #include <forward_list>
 #include <stdexcept>
 #include <string>
+#include <string_view>
 #include <unordered_set>
 
 #include "annotation.h"
+#include "data/types.h"
 #include "definitions.h"
 
 namespace marian {
@@ -96,7 +98,8 @@ class HTML {
                  std::vector<HTML::SpanIterator> &targetTokenSpans);
   void hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments,
                       std::vector<HTML::SpanIterator> const &sourceTokenSpans);
-  bool isContinuation(string_view prev, string_view str);
+  bool isContinuation(marian::string_view prev, marian::string_view str);
+  bool isContinuation(std::string_view prev, std::string_view str);
   // Allocates tag in pool_ (which then owns it) and gives a pointer to be used
   // in Taints. Pointer is valid as long as this HTML instance lives on.
   Tag *makeTag(Tag &&tag);

From 72e54f82bfe948fd46ad72ee72bce6d3ec0b886c Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Mon, 14 Feb 2022 13:41:59 +0000
Subject: [PATCH 11/29] Treat more elements as opaque when parsing

These are all elements that Firefox treats as opaque in its HTML5 parser. As a consequence, when you request `noscriptElement.innerHTML` you get the raw text content of the element, as opposed to a serialized tree. So invalid HTML? Just passed on as is! Well, we're going to do the same then. Besides, if noscript then also probably no extension.
---
 src/translator/html.h         |  2 +-
 src/translator/xh_scanner.cpp | 11 ++++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/translator/html.h b/src/translator/html.h
index f4f7d88f2..8c18272d9 100644
--- a/src/translator/html.h
+++ b/src/translator/html.h
@@ -38,7 +38,7 @@ class HTML {
     // they could be nested. For <script> we just scan for </script> because
     // the script tag may not be nested, but that is not the case for these
     // elements per se.
-    std::unordered_set<std::string> ignoredTags{"code", "kbd", "samp", "var", "dir", "acronym", "math", "textarea"};
+    std::unordered_set<std::string> ignoredTags{"code", "kbd", "samp", "var", "dir", "acronym", "math"};
 
     // List of characters that occur at the start of a token that indicate that
     // the this token is probably *not* a continuation of a word. Set to empty
diff --git a/src/translator/xh_scanner.cpp b/src/translator/xh_scanner.cpp
index 050f0acf9..a62f05f25 100644
--- a/src/translator/xh_scanner.cpp
+++ b/src/translator/xh_scanner.cpp
@@ -99,15 +99,16 @@ Scanner::TokenType Scanner::scanAttribute() {
   switch (input_.peek()) {
     case '>':
       input_.consume();
-      if (equalsCaseInsensitive(tagName_, "script")) {
+
+      // Treat some elements as opaque, e.g. <script>, <style>
+      if (equalsCaseInsensitive(tagName_, "title") || equalsCaseInsensitive(tagName_, "script") ||
+          equalsCaseInsensitive(tagName_, "style") || equalsCaseInsensitive(tagName_, "textarea") ||
+          equalsCaseInsensitive(tagName_, "iframe") || equalsCaseInsensitive(tagName_, "noembed") ||
+          equalsCaseInsensitive(tagName_, "noscript") || equalsCaseInsensitive(tagName_, "noframes")) {
         // script is special because we want to parse the attributes,
         // but not the content
         scanFun_ = &Scanner::scanSpecial;
         return scanSpecial();
-      } else if (equalsCaseInsensitive(tagName_, "style")) {
-        // same with style
-        scanFun_ = &Scanner::scanSpecial;
-        return scanSpecial();
       } else {
         scanFun_ = &Scanner::scanBody;
         return scanBody();

From ea244d2497bb0b314eda5c0822c13c42e51da8e8 Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Mon, 14 Feb 2022 17:51:17 +0000
Subject: [PATCH 12/29] Do not skip `<title>` for now

This tag is a bit difficult. No HTML is allowed inside of it (similar to `<textarea>`), but we do want to capture its text content as text (decoding entities etc.) so we can translate it. So for now I'll just trust that nobody is insane enough to use HTML inside the title tag. And if they do, we'll be as insane back and try to maintain that (very much not allowed) structure.
---
 src/translator/xh_scanner.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/translator/xh_scanner.cpp b/src/translator/xh_scanner.cpp
index a62f05f25..f9d9a43b3 100644
--- a/src/translator/xh_scanner.cpp
+++ b/src/translator/xh_scanner.cpp
@@ -101,7 +101,7 @@ Scanner::TokenType Scanner::scanAttribute() {
       input_.consume();
 
       // Treat some elements as opaque, e.g. <script>, <style>
-      if (equalsCaseInsensitive(tagName_, "title") || equalsCaseInsensitive(tagName_, "script") ||
+      if (/*equalsCaseInsensitive(tagName_, "title") ||*/ equalsCaseInsensitive(tagName_, "script") ||
           equalsCaseInsensitive(tagName_, "style") || equalsCaseInsensitive(tagName_, "textarea") ||
           equalsCaseInsensitive(tagName_, "iframe") || equalsCaseInsensitive(tagName_, "noembed") ||
           equalsCaseInsensitive(tagName_, "noscript") || equalsCaseInsensitive(tagName_, "noframes")) {

From dda9860f578e768f3a66cbe2a9a79de03751ad7f Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Wed, 16 Feb 2022 13:12:39 +0000
Subject: [PATCH 13/29] Follow clang-tidy advice

---
 src/translator/html.cpp | 37 ++++++++++++++++++++-----------------
 src/translator/html.h   |  7 +++----
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/src/translator/html.cpp b/src/translator/html.cpp
index 61b28d174..4567fd407 100644
--- a/src/translator/html.cpp
+++ b/src/translator/html.cpp
@@ -15,8 +15,8 @@ void encodeEntities(marian::string_view const &input, std::string &output) {
   output.clear();
   output.reserve(input.size());  // assumes there are no entities in most cases
 
-  for (const char *it = input.begin(); it != input.end(); ++it) {
-    switch (*it) {
+  for (char it : input) {
+    switch (it) {
       case '&':
         output.append("&amp;");
         break;
@@ -36,7 +36,7 @@ void encodeEntities(marian::string_view const &input, std::string &output) {
       //   output.append("&apos;");
       //   break;
       default:
-        output.push_back(*it);
+        output.push_back(it);
         break;
     }
   }
@@ -99,7 +99,7 @@ void diffTags(HTML::Taint const &prev, HTML::Taint const &curr, HTML::Taint &ope
   opening.clear();
   closing.clear();
 
-  size_t i = 0;
+  std::int64_t i = 0;
 
   // Find first difference
   for (; i < prev.size(); ++i)
@@ -129,6 +129,18 @@ bool isSubset(HTML::Taint const &a, HTML::Taint const &b) {
   return true;
 }
 
+template <typename T>
+size_t argmax(std::vector<T> const &items) {
+  assert(!items.empty());
+  size_t best = 0;
+  for (size_t i = 1; i + 1 < items.size(); ++i) {
+    if (items[i] > items[best]) {
+      best = i;
+    }
+  }
+  return best;
+}
+
 template <typename Fun>
 AnnotatedText apply(AnnotatedText const &in, Fun fun) {
   AnnotatedText out;
@@ -147,8 +159,6 @@ AnnotatedText apply(AnnotatedText const &in, Fun fun) {
 
     // Convert our ByteRanges to string_views since that's what appendSentence
     // expects
-    // TODO: extend AnnotatedText::appendSentence to accept str + ByteRanges
-    // directly
     std::vector<marian::string_view> views(tokens.size());
     std::transform(tokens.begin(), tokens.end(), views.begin(), [&](ByteRange const &range) {
       return marian::string_view(sentence.data() + range.begin, range.size());
@@ -606,7 +616,7 @@ AnnotatedText HTML::restoreTarget(AnnotatedText const &in, std::vector<SpanItera
   auto targetSpanIt = targetTokenSpans.begin();
   auto straggerSpanIt = spans_.cbegin();
 
-  AnnotatedText out = apply(in, [&](ByteRange range, string_view token, bool last) {
+  AnnotatedText out = apply(in, [&]([[maybe_unused]] ByteRange range, string_view token, bool last) {
     TokenFormatter formatter(token);
 
     // First we scan through spans_ to catch up to the span assigned to this
@@ -686,14 +696,14 @@ void HTML::copyTaint(Response const &response, std::vector<std::vector<size_t>>
 // to determine whether we should share the markup, or whether we should see
 // this token as a fresh start. This implementation will treat "hello[world]"
 // as 4 words, assuming its tokenised as something like `h ell o [ wor ld ]`.
-bool HTML::isContinuation(std::string_view prev, std::string_view str) {
+bool HTML::isContinuation(std::string_view prev, std::string_view str) const {
   if (options_.continuationDelimiters.empty()) return false;
   if (prev.empty() || str.empty()) return false;
   return options_.continuationDelimiters.find(str[0]) == std::string::npos &&
          options_.continuationDelimiters.find(prev.back()) == std::string::npos;
 }
 
-bool HTML::isContinuation(marian::string_view prev, marian::string_view str) {
+bool HTML::isContinuation(marian::string_view prev, marian::string_view str) const {
   return isContinuation(std::string_view(prev.data(), prev.size()), std::string_view(str.data(), str.size()));
 }
 
@@ -709,14 +719,7 @@ void HTML::hardAlignments(Response const &response, std::vector<std::vector<size
     // Note: only search from 0 to N-1 because token N is end-of-sentence token
     // that can only align with the end-of-sentence token of the target
     for (size_t t = 0; t + 1 < response.target.numWords(sentenceIdx); ++t) {
-      size_t maxS = 0;
-      for (size_t s = 1; s + 1 < response.source.numWords(sentenceIdx); ++s) {
-        if (response.alignments[sentenceIdx][t][s] > response.alignments[sentenceIdx][t][maxS]) {
-          maxS = s;
-        }
-      }
-
-      alignments.back().push_back(maxS);
+      alignments.back().push_back(::argmax(response.alignments[sentenceIdx][t]));
     }
 
     // Next, we try to smooth out these selected alignments with a few heuristics
diff --git a/src/translator/html.h b/src/translator/html.h
index 8c18272d9..436cbb0e2 100644
--- a/src/translator/html.h
+++ b/src/translator/html.h
@@ -11,8 +11,7 @@
 #include "data/types.h"
 #include "definitions.h"
 
-namespace marian {
-namespace bergamot {
+namespace marian::bergamot {
 
 struct Response;
 
@@ -93,8 +92,8 @@ class HTML {
                  std::vector<HTML::SpanIterator> &targetTokenSpans);
   void hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments,
                       std::vector<HTML::SpanIterator> const &sourceTokenSpans);
-  bool isContinuation(marian::string_view prev, marian::string_view str);
-  bool isContinuation(std::string_view prev, std::string_view str);
+  bool isContinuation(marian::string_view prev, marian::string_view str) const;
+  bool isContinuation(std::string_view prev, std::string_view str) const;
   // Allocates tag in pool_ (which then owns it) and gives a pointer to be used
   // in Taints. Pointer is valid as long as this HTML instance lives on.
   Tag *makeTag(Tag &&tag);

From d7e1c07ec310a9186321308f40b3e72f083dee26 Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Wed, 16 Feb 2022 13:13:33 +0000
Subject: [PATCH 14/29] Fix missing \n\n?

I don't know what happened here.
---
 src/translator/html.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/translator/html.cpp b/src/translator/html.cpp
index 4567fd407..289df0d7b 100644
--- a/src/translator/html.cpp
+++ b/src/translator/html.cpp
@@ -400,7 +400,8 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio
         // If the previous segment was the open or close tag of a block element
         // we treat the text after it as a new sentence.
         if (addSentenceBreak) {
-          if (!(source.empty() || (source.size() > 2 && source.substr(source.size() - 2) == ""))) {
+          // If there isn't already a \n\n at the end of source...
+          if (source.size() < 2 || source.substr(source.size() - 2) != "\n\n") {
             stack.push_back(makeTag({Tag::WHITESPACE}));
             // Important: span->size() == 0 to make it behave as a void element.
             // Also important: position before the \n\n tokens, not after, to

From 203ba0ae1189416fe279b7727bc8d4e9660bf92d Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Wed, 16 Feb 2022 13:16:44 +0000
Subject: [PATCH 15/29] Add more comments and less creative variable names
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Hopefully this will make the overall code more readable given you're familiar with the concept it tries to implement…
---
 src/translator/html.cpp | 116 ++++++++++++++++++++++++----------------
 src/translator/html.h   |  41 +++++++-------
 2 files changed, 92 insertions(+), 65 deletions(-)

diff --git a/src/translator/html.cpp b/src/translator/html.cpp
index 289df0d7b..305165019 100644
--- a/src/translator/html.cpp
+++ b/src/translator/html.cpp
@@ -11,6 +11,7 @@ using marian::bergamot::ByteRange;
 using marian::bergamot::HTML;
 using marian::bergamot::Response;
 
+/// Encodes the minimum of HTML entities.
 void encodeEntities(marian::string_view const &input, std::string &output) {
   output.clear();
   output.reserve(input.size());  // assumes there are no entities in most cases
@@ -42,6 +43,8 @@ void encodeEntities(marian::string_view const &input, std::string &output) {
   }
 }
 
+/// Counts number of whitespace characters at the start of the input. Used
+/// for determining where to insert an open or close tag.
 size_t countPrefixWhitespaces(marian::string_view const &input) {
   size_t size = 0;
   while (size < input.size() && std::isspace(input[size])) ++size;
@@ -55,7 +58,9 @@ std::string toLowerCase(std::string_view const &input) {
   return out;
 }
 
-// Very simple replacement for std::format introduced in C++20
+/// Very simple replacement for std::format introduced in C++20. Only supports
+/// replacing `{}` in the template string with whatever `operator<<` for that
+/// type turns it into.
 std::string format(std::string const &formatTemplate) { return formatTemplate; }
 
 template <typename Arg>
@@ -76,9 +81,9 @@ std::string format(std::string const &formatTemplate, Arg arg, Args... args) {
   return os.str();
 }
 
-// Syntactic sugar around rbegin() and rend() that allows me to write
-// `for (auto &&item : reversed(container))` instead of the needlessly verbose
-// `for (auto it = container.rbegin(); it != container.rend(); ++it)`
+/// Syntactic sugar around rbegin() and rend() that allows me to write
+/// `for (auto &&item : reversed(container))` instead of the needlessly verbose
+/// `for (auto it = container.rbegin(); it != container.rend(); ++it)`
 template <typename T>
 class Reversed {
  public:
@@ -91,11 +96,10 @@ class Reversed {
   T const &container_;
 };
 
-bool contains(std::unordered_set<std::string> const &set, std::string const &name) {
-  return set.find(name) != set.end();
-}
-
-void diffTags(HTML::Taint const &prev, HTML::Taint const &curr, HTML::Taint &opening, HTML::Taint &closing) {
+/// When comparing two tag stacks, determine which tags need to be closed and
+/// opened to get from one stack to the other.
+void diffTags(HTML::TagStack const &prev, HTML::TagStack const &curr, HTML::TagStack &opening,
+              HTML::TagStack &closing) {
   opening.clear();
   closing.clear();
 
@@ -116,11 +120,13 @@ bool intersects(ByteRange const &range, HTML::Span const &span) {
   return range.begin <= span.end && range.end >= span.begin;
 };
 
-bool containsTag(HTML::Taint const &stack, HTML::Tag const *tag) {
+bool contains(HTML::TagNameSet const &set, std::string_view const &name) { return set.find(name) != set.end(); }
+
+bool contains(HTML::TagStack const &stack, HTML::Tag const *tag) {
   return std::find(stack.rbegin(), stack.rend(), tag) != stack.rend();
 }
 
-bool isSubset(HTML::Taint const &a, HTML::Taint const &b) {
+bool isSubset(HTML::TagStack const &a, HTML::TagStack const &b) {
   if (a.size() > b.size()) return false;
 
   for (auto i = a.begin(), j = b.begin(); i != a.end(); ++i, ++j)
@@ -141,6 +147,10 @@ size_t argmax(std::vector<T> const &items) {
   return best;
 }
 
+/// Utility function to call `fun` on each word (subword token effectively) in
+/// an `AnnotatedText`. `fun` is called with the `ByteRange`, the `string_view`
+/// with the word, and a `bool` to indicate whether it is the last word in the
+/// `AnnotatedText`, which is also the ending whitespace slot of AnnotatedText.
 template <typename Fun>
 AnnotatedText apply(AnnotatedText const &in, Fun fun) {
   AnnotatedText out;
@@ -172,6 +182,7 @@ AnnotatedText apply(AnnotatedText const &in, Fun fun) {
   return out;
 }
 
+/// Tests whether `response` has alignment info associated with it or not.
 bool hasAlignments(Response const &response) {
   // Test for each sentence individually as a sentence may be empty (or there)
   // might be no sentences, so just testing for alignments.empty() would not be
@@ -190,7 +201,8 @@ bool hasAlignments(Response const &response) {
   return true;
 }
 
-// Little helper class to append HTML to a token
+/// Helper class to append HTML tags to a token. Also makes sure the token is
+/// encoded as valid HTML.
 class TokenFormatter {
  public:
   explicit TokenFormatter(marian::string_view token)
@@ -202,8 +214,8 @@ class TokenFormatter {
   std::string &&html() { return std::move(html_); }
 
   // Append the markup necessary for moving from `prev` set of tags to `curr`.
-  void append(HTML::Taint const &prev, HTML::Taint const &curr) {
-    HTML::Taint opening, closing;
+  void append(HTML::TagStack const &prev, HTML::TagStack const &curr) {
+    HTML::TagStack opening, closing;
 
     diffTags(prev, curr, opening, closing);
 
@@ -260,6 +272,8 @@ class TokenFormatter {
   bool closeLeft_;
 };
 
+/// Count the number of tokens in an AnnotatedText. Used to assert we're not
+/// running out of sync when creating vectors that describe each token.
 size_t debugCountTokens(AnnotatedText const &text) {
   size_t tokens = 1;  // for the ending gap
   for (size_t sentenceIdx = 0; sentenceIdx < text.numSentences(); ++sentenceIdx) {
@@ -268,10 +282,10 @@ size_t debugCountTokens(AnnotatedText const &text) {
   return tokens;
 }
 
-// Helper function that consumes a tag as if it is a special tag, except that it
-// takes nesting into account. I.e. `<a><a></a></a>` will be consumed to the
+/// Helper function that consumes a tag as if it is a special tag, except that
+/// it takes nesting into account. I.e. `<a><a></a></a>` will be consumed to the
 // last `</a>`. Assumes TT_TAG_START is already consumed, which was necessary
-// to determine whether this was an element that needed to be ignored.
+/// to determine whether this was an element that needed to be ignored.
 void consumeIgnoredTag(markup::Scanner &scanner, HTML::Tag &tag, std::string const &name) {
   // Only full elements can be consumed this way. With void tags we don't know
   // where to stop scanning. All other types cannot be nested anyway.
@@ -346,7 +360,7 @@ void consumeIgnoredTag(markup::Scanner &scanner, HTML::Tag &tag, std::string con
 
 namespace marian::bergamot {
 
-// Formatters used for exception messages combined with format()
+/// Formatters used for formatting error messages in ABORT() calls.
 std::ostream &operator<<(std::ostream &out, HTML::Tag const *tag) {
   if (tag == nullptr) return out << "[nullptr]";
   switch (tag->type) {
@@ -364,7 +378,7 @@ std::ostream &operator<<(std::ostream &out, HTML::Tag const *tag) {
   return out << "[Unknown tag type]";
 }
 
-std::ostream &operator<<(std::ostream &out, HTML::Taint const &tags) {
+std::ostream &operator<<(std::ostream &out, HTML::TagStack const &tags) {
   for (auto it = tags.begin(); it != tags.end(); ++it) {
     if (it != tags.begin()) out << ' ';
     out << *it;
@@ -372,18 +386,20 @@ std::ostream &operator<<(std::ostream &out, HTML::Taint const &tags) {
   return out;
 }
 
-HTML::HTML(std::string &&source, bool process_markup, Options &&options) : options_(std::move(options)) {
-  if (!process_markup) return;
+HTML::HTML(std::string &&source, bool processMarkup, Options &&options) : options_(std::move(options)) {
+  if (!processMarkup) return;
 
   std::string original = std::move(source);
   markup::instream in(original.data(), original.data() + original.size());
   markup::Scanner scanner(in);
   source.clear();  // source is moved out of, so should be clear anyway
 
-  Tag *tag;
-  Taint stack;
-  bool addSentenceBreak = false;
-  bool addSpace = false;
+  Tag *tag = nullptr;             // current tag (after opening at least)
+  TagStack stack;                 // stack of currently open tags
+  bool addSentenceBreak = false;  // whether to add a sentence break next text segment
+  bool addWordBreak = false;      // whether to add a word break next text segment
+
+  // Starting point: an empty span with no open tags.
   spans_.push_back(Span{0, 0, {}});
 
   bool stop = false;
@@ -407,7 +423,7 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio
             // Also important: position before the \n\n tokens, not after, to
             // make it easier to remove them later through apply().
             spans_.push_back(Span{source.size(), source.size(), stack});
-            source.append("\n\n");  // TODO assumes ssplit-mode = wrapped_text
+            source.append("\n\n");  // Should work with ssplit-mode = wrapped_text
             stack.pop_back();
           }
           addSentenceBreak = false;
@@ -415,13 +431,16 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio
 
         // If the previous segment was an open or close tag, it might be best
         // to add a space to make sure we don't append to the previous word.
-        if (addSpace) {
+        if (addWordBreak) {
+          // Only add the space when it would be inside a word. Do not add it if
+          // it would be between a word and punctuation.
           if (options_.substituteInlineTagsWithSpaces && isContinuation(source, scanner.value())) {
             source.push_back(' ');
           }
-          addSpace = false;
+          addWordBreak = false;
         }
 
+        // Store which tags were open when this span of text was encountered.
         auto begin = source.size();
         source.append(scanner.value());
         spans_.push_back(Span{begin, source.size(), stack});
@@ -431,8 +450,8 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio
         std::string name = toLowerCase(scanner.tag());
 
         // Tag *tag is used by attribute parsing
-        tag =
-            makeTag({contains(options_.voidTags, name) ? Tag::VOID_ELEMENT : Tag::ELEMENT, std::string(scanner.tag())});
+        auto type = contains(options_.voidTags, name) ? Tag::VOID_ELEMENT : Tag::ELEMENT;
+        tag = makeTag({type, std::string(scanner.tag())});
 
         stack.push_back(tag);
 
@@ -456,7 +475,7 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio
         if (!contains(options_.inlineTags, name)) {
           addSentenceBreak = true;
         } else if (!contains(options_.inWordTags, name)) {
-          addSpace = true;
+          addWordBreak = true;
         }
       } break;
 
@@ -474,7 +493,7 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio
         // What to do with "<u></u>" case, where tag is immediately closed
         // so it never makes it into the taint of any of the spans? This adds
         // an empty span so it still gets recorded in spans_.
-        if (spans_.empty() || !containsTag(spans_.back().tags, stack.back()))
+        if (spans_.empty() || !contains(spans_.back().tags, stack.back()))
           spans_.push_back(Span{source.size(), source.size(), stack});
 
         stack.pop_back();
@@ -483,7 +502,7 @@ HTML::HTML(std::string &&source, bool process_markup, Options &&options) : optio
         if (!contains(options_.inlineTags, tagName)) {
           addSentenceBreak = true;
         } else if (!contains(options_.inWordTags, tagName)) {
-          addSpace = true;
+          addWordBreak = true;
         }
       } break;
 
@@ -563,7 +582,7 @@ void HTML::restore(Response &response) {
   hardAlignments(response, alignments, sourceTokenSpans);
 
   std::vector<SpanIterator> targetTokenSpans;
-  copyTaint(response, alignments, sourceTokenSpans, targetTokenSpans);
+  copyTagStack(response, alignments, sourceTokenSpans, targetTokenSpans);
   assert(targetTokenSpans.size() == debugCountTokens(response.target));
 
   AnnotatedText target = restoreTarget(response.target, targetTokenSpans);
@@ -587,9 +606,11 @@ AnnotatedText HTML::restoreSource(AnnotatedText const &in, std::vector<SpanItera
     //   spans     |1|   |2|    |3333| (so only 2 is tainted with <p><u>, others only <p>)
     //  tokens     |111111111111111|2|
     //
-    // Now 1 covers span 1 to 3, so what taint should it get? Just <p>, or <p><u>?
-    // Note: only relevant if isBlockElement is used. If we just insert spaces
-    // around all elements, every segment of `hello` will be a token.
+    // Now 1 covers span 1 to 3, so what taint should it get? Just `<p>`, or
+    // `<p><u>`?
+    // Note: only relevant if `substituteInlineTagsWithSpaces` is true. If we
+    // just insert spaces around all elements, every segment of `hello` will be
+    // a token.
 
     // Seek to the last span that overlaps with this token
     while (true) {
@@ -606,7 +627,7 @@ AnnotatedText HTML::restoreSource(AnnotatedText const &in, std::vector<SpanItera
 
     // TODO: This is just the taint of the last span, not the ones in between.
     // This makes us lose some markup of parts of tokens as described above.
-    sourceTokenSpans.push_back(prevIt);
+    sourceTokenSpans.emplace_back(prevIt);
 
     return std::move(formatter.html());
   });
@@ -652,7 +673,7 @@ AnnotatedText HTML::restoreTarget(AnnotatedText const &in, std::vector<SpanItera
       // the last token of the output. But lets assume someone someday changes
       // HardAlignments(), and then this for-loop will be necessary.
       // assert((*targetSpanIt)->tags.empty());
-      formatter.append((*targetSpanIt)->tags, HTML::Taint());
+      formatter.append((*targetSpanIt)->tags, HTML::TagStack());
     }
 
     prevSpan = *targetSpanIt;
@@ -672,8 +693,9 @@ HTML::Tag *HTML::makeTag(Tag &&tag) {
   return &pool_.front();
 }
 
-void HTML::copyTaint(Response const &response, std::vector<std::vector<size_t>> const &alignments,
-                     std::vector<SpanIterator> const &sourceTokenSpans, std::vector<SpanIterator> &targetTokenSpans) {
+void HTML::copyTagStack(Response const &response, std::vector<std::vector<size_t>> const &alignments,
+                        std::vector<SpanIterator> const &sourceTokenSpans,
+                        std::vector<SpanIterator> &targetTokenSpans) {
   size_t offset = 0;  // Sentence offset in sourceTokenSpans
 
   // Fill targetTokenSpans based on the alignments we just made up.
@@ -708,9 +730,13 @@ bool HTML::isContinuation(marian::string_view prev, marian::string_view str) con
   return isContinuation(std::string_view(prev.data(), prev.size()), std::string_view(str.data(), str.size()));
 }
 
+/// Selects for each token in `response.target` a best source token from
+/// `response.source` and writes this selection to `alignments`. The source
+/// token spans are used to also look at the markup applied to each token to
+/// figure out which source token best represents each target token.
 void HTML::hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments,
                           std::vector<SpanIterator> const &sourceTokenSpans) {
-  size_t offset = 0;
+  size_t offset = 0;  // sentence offset in sourceTokenSpans
 
   // For each sentence...
   for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
@@ -735,14 +761,14 @@ void HTML::hardAlignments(Response const &response, std::vector<std::vector<size
         float currScore = response.alignments[sentenceIdx][t][currSentenceIdx];
         float prevScore = response.alignments[sentenceIdx][t - 1][prevSentenceIdx];
 
-        Taint const &currTaint = sourceTokenSpans[offset + 1 + currSentenceIdx]->tags;
-        Taint const &prevTaint = sourceTokenSpans[offset + 1 + prevSentenceIdx]->tags;
+        TagStack const &currTagStack = sourceTokenSpans[offset + 1 + currSentenceIdx]->tags;
+        TagStack const &prevTagStack = sourceTokenSpans[offset + 1 + prevSentenceIdx]->tags;
 
         // If this token has more markup, or a better score than the previous
         // token (and they together are part of a word-ish thing) then mark
         // this word as aligning. Otherwise just copy the alignment source of
         // the previous token.
-        if (isSubset(prevTaint, currTaint) || currScore >= prevScore) {
+        if (isSubset(prevTagStack, currTagStack) || currScore >= prevScore) {
           // Apply this to all previous tokens in the word
           for (size_t i = t;; --i) {
             alignments.back()[i] = currSentenceIdx;
diff --git a/src/translator/html.h b/src/translator/html.h
index 436cbb0e2..a71490fba 100644
--- a/src/translator/html.h
+++ b/src/translator/html.h
@@ -2,10 +2,10 @@
 #define SRC_BERGAMOT_HTML_H_
 
 #include <forward_list>
+#include <set>
 #include <stdexcept>
 #include <string>
 #include <string_view>
-#include <unordered_set>
 
 #include "annotation.h"
 #include "data/types.h"
@@ -17,27 +17,29 @@ struct Response;
 
 class HTML {
  public:
+  using TagNameSet = std::set<std::string, std::less<>>;
+
   struct Options {
     // List of elements for which we do not expect a closing tag, or self-closing
     // elements in XHTML. See also https://developer.mozilla.org/en-US/docs/Glossary/Empty_element
     // More relevant source of this list:
     // https://searchfox.org/mozilla-central/rev/7d17fd1fe9f0005a2fb19e5d53da4741b06a98ba/dom/base/FragmentOrElement.cpp#1791
-    std::unordered_set<std::string> voidTags{"area",  "base",  "basefont", "bgsound", "br",    "col",
-                                             "embed", "frame", "hr",       "img",     "input", "keygen",
-                                             "link",  "meta",  "param",    "source",  "track", "wbr"};
+    TagNameSet voidTags{"area", "base",  "basefont", "bgsound", "br",   "col",   "embed",  "frame", "hr",
+                        "img",  "input", "keygen",   "link",    "meta", "param", "source", "track", "wbr"};
 
-    std::unordered_set<std::string> inlineTags{"abbr",   "a", "b",    "em",    "i",    "kbd",    "mark", "math",
-                                               "output", "q", "ruby", "small", "span", "strong", "sub",  "sup",
-                                               "time",   "u", "var",  "wbr",   "ins",  "del",    "img"};
+    TagNameSet inlineTags{"abbr",   "a", "b",    "em",    "i",    "kbd",    "mark", "math",
+                          "output", "q", "ruby", "small", "span", "strong", "sub",  "sup",
+                          "time",   "u", "var",  "wbr",   "ins",  "del",    "img"};
 
     // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/wbr
-    std::unordered_set<std::string> inWordTags{"wbr"};
+    TagNameSet inWordTags{"wbr"};
 
     // List of elements we copy as is, but do parse as if they're HTML because
     // they could be nested. For <script> we just scan for </script> because
     // the script tag may not be nested, but that is not the case for these
-    // elements per se.
-    std::unordered_set<std::string> ignoredTags{"code", "kbd", "samp", "var", "dir", "acronym", "math"};
+    // elements per se. Some tags, like <script>, are ignored at the xh_scanner
+    // level. See xh_scanner.cpp/Scanner::scanAttribute().
+    TagNameSet ignoredTags{"code", "kbd", "samp", "var", "dir", "acronym", "math"};
 
     // List of characters that occur at the start of a token that indicate that
     // the this token is probably *not* a continuation of a word. Set to empty
@@ -68,17 +70,17 @@ class HTML {
     // `attributes` and `data` with string_views pointing to it.
   };
 
-  using Taint = std::vector<Tag *>;
+  using TagStack = std::vector<Tag *>;
 
   struct Span {
     size_t begin;
     size_t end;
-    Taint tags;  // Note: free pointers! Lifetime of tags is managed by pool_
+    TagStack tags;  // Note: free pointers! Lifetime of tags is managed by pool_
     inline size_t size() const { return end - begin; }
   };
 
-  explicit HTML(std::string &&source, bool process_markup) : HTML(std::move(source), process_markup, HTML::Options{}){};
-  explicit HTML(std::string &&source, bool process_markup, Options &&options);
+  explicit HTML(std::string &&source, bool processMarkup) : HTML(std::move(source), processMarkup, HTML::Options{}){};
+  explicit HTML(std::string &&source, bool processMarkup, Options &&options);
   void restore(Response &response);
 
  private:
@@ -87,15 +89,15 @@ class HTML {
 
   AnnotatedText restoreSource(AnnotatedText const &in, std::vector<SpanIterator> &sourceTokenSpans);
   AnnotatedText restoreTarget(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans);
-  void copyTaint(Response const &response, std::vector<std::vector<size_t>> const &alignments,
-                 std::vector<HTML::SpanIterator> const &sourceTokenSpans,
-                 std::vector<HTML::SpanIterator> &targetTokenSpans);
+  void copyTagStack(Response const &response, std::vector<std::vector<size_t>> const &alignments,
+                    std::vector<HTML::SpanIterator> const &sourceTokenSpans,
+                    std::vector<HTML::SpanIterator> &targetTokenSpans);
   void hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments,
                       std::vector<HTML::SpanIterator> const &sourceTokenSpans);
   bool isContinuation(marian::string_view prev, marian::string_view str) const;
   bool isContinuation(std::string_view prev, std::string_view str) const;
   // Allocates tag in pool_ (which then owns it) and gives a pointer to be used
-  // in Taints. Pointer is valid as long as this HTML instance lives on.
+  // in TagStacks. Pointer is valid as long as this HTML instance lives on.
   Tag *makeTag(Tag &&tag);
 
   Options options_;
@@ -107,7 +109,6 @@ class HTML {
   std::forward_list<Tag> pool_;
 };
 
-}  // namespace bergamot
-}  // namespace marian
+}  // namespace marian::bergamot
 
 #endif  // SRC_BERGAMOT_HTML_H_

From a1ee8e9de3ecb757ac67ba1e7c715d0c7f9117f6 Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Wed, 16 Feb 2022 13:18:48 +0000
Subject: [PATCH 16/29] Too many negations and my head just negates itself

---
 src/translator/html.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/translator/html.cpp b/src/translator/html.cpp
index 305165019..afa4dd2a6 100644
--- a/src/translator/html.cpp
+++ b/src/translator/html.cpp
@@ -417,7 +417,7 @@ HTML::HTML(std::string &&source, bool processMarkup, Options &&options) : option
         // we treat the text after it as a new sentence.
         if (addSentenceBreak) {
           // If there isn't already a \n\n at the end of source...
-          if (source.size() < 2 || source.substr(source.size() - 2) != "\n\n") {
+          if (source.size() >= 2 && source.substr(source.size() - 2) != "\n\n") {
             stack.push_back(makeTag({Tag::WHITESPACE}));
             // Important: span->size() == 0 to make it behave as a void element.
             // Also important: position before the \n\n tokens, not after, to

From ac83e509b41acf186cfa402bb2cd1d7e142b806c Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Mon, 21 Feb 2022 13:03:56 +0000
Subject: [PATCH 17/29] Update bergamot-translator-tests

---
 bergamot-translator-tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bergamot-translator-tests b/bergamot-translator-tests
index 3c0f95a17..c8a6db3cd 160000
--- a/bergamot-translator-tests
+++ b/bergamot-translator-tests
@@ -1 +1 @@
-Subproject commit 3c0f95a1775a74f5db441aa2f17ceb7437679022
+Subproject commit c8a6db3cdd887048a1d2c381d11ee0de03e6b8cd

From 6a7bd2176b3256e98741aa2db8ac92caf44c4a46 Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Mon, 21 Feb 2022 16:14:31 +0000
Subject: [PATCH 18/29] Update tests

---
 bergamot-translator-tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bergamot-translator-tests b/bergamot-translator-tests
index c8a6db3cd..4cda39cec 160000
--- a/bergamot-translator-tests
+++ b/bergamot-translator-tests
@@ -1 +1 @@
-Subproject commit c8a6db3cdd887048a1d2c381d11ee0de03e6b8cd
+Subproject commit 4cda39cecd1d0ec8b9ca8a4ff02ad608ae01b7cd

From c90d00f9dd4292f7355b3e3101e7015236ca007b Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Mon, 21 Feb 2022 16:27:39 +0000
Subject: [PATCH 19/29] Replace snake_case and magic numbers

---
 src/translator/xh_scanner.cpp | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/src/translator/xh_scanner.cpp b/src/translator/xh_scanner.cpp
index f9d9a43b3..724d02cb9 100644
--- a/src/translator/xh_scanner.cpp
+++ b/src/translator/xh_scanner.cpp
@@ -37,6 +37,11 @@ bool operator==(markup::string_ref const &str, const Char_t (&str2)[Len]) {
   return str.size == Len - 1 && std::memcmp(str.data, str2, Len - 1) == 0;
 }
 
+template <size_t N>
+constexpr size_t length(char const (&/*unused*/)[N]) {
+  return N - 1;
+}
+
 }  // end namespace
 
 namespace markup {
@@ -204,8 +209,8 @@ Scanner::TokenType Scanner::scanTag() {
   start_ = input_.pos();
   if (input_.consume() != '<') return TT_ERROR;
 
-  bool is_tail = input_.peek() == '/';
-  if (is_tail) input_.consume();
+  bool isTail = input_.peek() == '/';
+  if (isTail) input_.consume();
 
   tagName_ = string_ref{input_.pos(), 0};
 
@@ -230,7 +235,7 @@ Scanner::TokenType Scanner::scanTag() {
 
   if (!input_.peek()) return TT_EOF;
 
-  if (is_tail) return input_.consume() == '>' ? TT_TAG_END : TT_ERROR;
+  if (isTail) return input_.consume() == '>' ? TT_TAG_END : TT_ERROR;
 
   scanFun_ = &Scanner::scanAttribute;
   return TT_TAG_START;
@@ -317,7 +322,7 @@ bool Scanner::isWhitespace(char c) {
 
 Scanner::TokenType Scanner::scanComment() {
   if (gotTail_) {
-    start_ = input_.pos() - 3;  // minus "-->"
+    start_ = input_.pos() - length("-->");  // minus "-->"
     scanFun_ = &Scanner::scanBody;
     gotTail_ = false;
     return TT_COMMENT_END;
@@ -332,7 +337,7 @@ Scanner::TokenType Scanner::scanComment() {
 
     if (endsWith(value_, "-->")) {
       gotTail_ = true;
-      value_.size -= 3;
+      value_.size -= length("-->");
       break;
     }
   }
@@ -341,7 +346,7 @@ Scanner::TokenType Scanner::scanComment() {
 
 Scanner::TokenType Scanner::scanProcessingInstruction() {
   if (gotTail_) {
-    start_ = input_.pos() - 2;
+    start_ = input_.pos() - length("?>");
     scanFun_ = &Scanner::scanBody;
     gotTail_ = false;
     return TT_PROCESSING_INSTRUCTION_END;
@@ -356,7 +361,7 @@ Scanner::TokenType Scanner::scanProcessingInstruction() {
 
     if (endsWith(value_, "?>")) {
       gotTail_ = true;
-      value_.size -= 2;
+      value_.size -= length("?>");
       break;
     }
   }
@@ -365,7 +370,7 @@ Scanner::TokenType Scanner::scanProcessingInstruction() {
 
 Scanner::TokenType Scanner::scanSpecial() {
   if (gotTail_) {
-    start_ = input_.pos() - (tagName_.size + 3);
+    start_ = input_.pos() - (tagName_.size + length("</>"));
     scanFun_ = &Scanner::scanBody;
     gotTail_ = false;
     return TT_TAG_END;
@@ -380,17 +385,17 @@ Scanner::TokenType Scanner::scanSpecial() {
 
     // Test for </tag>
     // TODO: no whitespaces allowed? Is that okay?
-    if (value_.data[value_.size - 1] == '>' && value_.size >= tagName_.size + 3) {
+    if (value_.data[value_.size - 1] == '>' && value_.size >= tagName_.size + length("</>")) {
       // Test for the "</"" bit of "</tag>"
-      size_t pos_tag_start = value_.size - tagName_.size - 3;
-      if (std::memcmp(value_.data + pos_tag_start, "</", 2) != 0) continue;
+      size_t posTagStart = value_.size - tagName_.size - length("</>");
+      if (std::memcmp(value_.data + posTagStart, "</", length("</")) != 0) continue;
 
       // Test for the "tag" bit of "</tag>". Doing case insensitive compare because <I>...</i> is okay.
-      size_t pos_tag_name = value_.size - tagName_.size - 1;  // end - tag>
-      if (!equalsCaseInsensitive(value_.data + pos_tag_name, tagName_.data, tagName_.size)) continue;
+      size_t posTagName = value_.size - tagName_.size - length(">");  // end - tag>
+      if (!equalsCaseInsensitive(value_.data + posTagName, tagName_.data, tagName_.size)) continue;
 
       gotTail_ = true;
-      value_.size -= tagName_.size + 3;
+      value_.size -= tagName_.size + length("</>");
       break;
     }
   }

From ad612e49c054a700c9b63bd2e6b75c92e2754ee6 Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Mon, 21 Feb 2022 16:28:15 +0000
Subject: [PATCH 20/29] Use std::max_element instead of own implementation

---
 src/translator/html.cpp | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/src/translator/html.cpp b/src/translator/html.cpp
index afa4dd2a6..1334495b1 100644
--- a/src/translator/html.cpp
+++ b/src/translator/html.cpp
@@ -135,18 +135,6 @@ bool isSubset(HTML::TagStack const &a, HTML::TagStack const &b) {
   return true;
 }
 
-template <typename T>
-size_t argmax(std::vector<T> const &items) {
-  assert(!items.empty());
-  size_t best = 0;
-  for (size_t i = 1; i + 1 < items.size(); ++i) {
-    if (items[i] > items[best]) {
-      best = i;
-    }
-  }
-  return best;
-}
-
 /// Utility function to call `fun` on each word (subword token effectively) in
 /// an `AnnotatedText`. `fun` is called with the `ByteRange`, the `string_view`
 /// with the word, and a `bool` to indicate whether it is the last word in the
@@ -746,7 +734,9 @@ void HTML::hardAlignments(Response const &response, std::vector<std::vector<size
     // Note: only search from 0 to N-1 because token N is end-of-sentence token
     // that can only align with the end-of-sentence token of the target
     for (size_t t = 0; t + 1 < response.target.numWords(sentenceIdx); ++t) {
-      alignments.back().push_back(::argmax(response.alignments[sentenceIdx][t]));
+      alignments.back().push_back(
+          std::max_element(response.alignments[sentenceIdx][t].begin(), response.alignments[sentenceIdx][t].end()) -
+          response.alignments[sentenceIdx][t].begin());
     }
 
     // Next, we try to smooth out these selected alignments with a few heuristics

From f45198313d1efcf1ce7ef02560d3a82b3784688d Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Mon, 21 Feb 2022 16:42:35 +0000
Subject: [PATCH 21/29] Rename isSubset to extends (and flip argument order for
 readability)

---
 src/translator/html.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/translator/html.cpp b/src/translator/html.cpp
index 1334495b1..60f92c8f9 100644
--- a/src/translator/html.cpp
+++ b/src/translator/html.cpp
@@ -126,7 +126,9 @@ bool contains(HTML::TagStack const &stack, HTML::Tag const *tag) {
   return std::find(stack.rbegin(), stack.rend(), tag) != stack.rend();
 }
 
-bool isSubset(HTML::TagStack const &a, HTML::TagStack const &b) {
+/// Is tag stack B an extended version of A? I.e. same tags, but maybe a few
+/// more nested deeper.
+bool extends(HTML::TagStack const &b, HTML::TagStack const &a) {
   if (a.size() > b.size()) return false;
 
   for (auto i = a.begin(), j = b.begin(); i != a.end(); ++i, ++j)
@@ -758,7 +760,7 @@ void HTML::hardAlignments(Response const &response, std::vector<std::vector<size
         // token (and they together are part of a word-ish thing) then mark
         // this word as aligning. Otherwise just copy the alignment source of
         // the previous token.
-        if (isSubset(prevTagStack, currTagStack) || currScore >= prevScore) {
+        if (extends(currTagStack, prevTagStack) || currScore >= prevScore) {
           // Apply this to all previous tokens in the word
           for (size_t i = t;; --i) {
             alignments.back()[i] = currSentenceIdx;

From c891eda5a2f91b05448a7b38c3f4674772fca653 Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Mon, 21 Feb 2022 16:47:07 +0000
Subject: [PATCH 22/29] Move apply(AnnotatedText const&, Fun) to AnnotatedText
 itself.

---
 src/translator/annotation.h | 35 +++++++++++++++++++++++++++++++++
 src/translator/html.cpp     | 39 ++-----------------------------------
 2 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/src/translator/annotation.h b/src/translator/annotation.h
index 785d49dfe..5a17dfcfe 100644
--- a/src/translator/annotation.h
+++ b/src/translator/annotation.h
@@ -185,6 +185,41 @@ struct AnnotatedText {
   /// Returns a ByteRange representing sentence corresponding to sentenceIdx.
   ByteRange sentenceAsByteRange(size_t sentenceIdx) const { return annotation.sentence(sentenceIdx); }
 
+  /// Utility function to call `fun` on each word (subword token effectively) in
+  /// an `AnnotatedText`. `fun` is called with the `ByteRange`, the `string_view`
+  /// with the word, and a `bool` to indicate whether it is the last word in the
+  /// `AnnotatedText`, which is also the ending whitespace slot of AnnotatedText.
+  template <typename Fun>
+  AnnotatedText apply(Fun fun) const {
+    AnnotatedText out;
+
+    for (size_t sentenceIdx = 0; sentenceIdx < numSentences(); ++sentenceIdx) {
+      std::string sentence;
+      std::vector<ByteRange> tokens;
+
+      std::string prefix = fun(annotation.gap(sentenceIdx), gap(sentenceIdx), false);
+
+      for (size_t wordIdx = 0; wordIdx < numWords(sentenceIdx); ++wordIdx) {
+        std::string token = fun(wordAsByteRange(sentenceIdx, wordIdx), word(sentenceIdx, wordIdx), false);
+        tokens.push_back(ByteRange{sentence.size(), sentence.size() + token.size()});
+        sentence += token;
+      }
+
+      // Convert our ByteRanges to string_views since that's what appendSentence
+      // expects
+      std::vector<marian::string_view> views(tokens.size());
+      std::transform(tokens.begin(), tokens.end(), views.begin(), [&](ByteRange const &range) {
+        return marian::string_view(sentence.data() + range.begin, range.size());
+      });
+
+      out.appendSentence(prefix, views.begin(), views.end());
+    }
+
+    out.appendEndingWhitespace(fun(annotation.gap(numSentences()), gap(numSentences()), true));
+
+    return out;
+  }
+
  private:
   string_view asStringView(const ByteRange &byteRange) const {
     return string_view(text.data() + byteRange.begin, byteRange.size());
diff --git a/src/translator/html.cpp b/src/translator/html.cpp
index 60f92c8f9..55c058216 100644
--- a/src/translator/html.cpp
+++ b/src/translator/html.cpp
@@ -137,41 +137,6 @@ bool extends(HTML::TagStack const &b, HTML::TagStack const &a) {
   return true;
 }
 
-/// Utility function to call `fun` on each word (subword token effectively) in
-/// an `AnnotatedText`. `fun` is called with the `ByteRange`, the `string_view`
-/// with the word, and a `bool` to indicate whether it is the last word in the
-/// `AnnotatedText`, which is also the ending whitespace slot of AnnotatedText.
-template <typename Fun>
-AnnotatedText apply(AnnotatedText const &in, Fun fun) {
-  AnnotatedText out;
-
-  for (size_t sentenceIdx = 0; sentenceIdx < in.numSentences(); ++sentenceIdx) {
-    std::string sentence;
-    std::vector<ByteRange> tokens;
-
-    std::string prefix = fun(in.annotation.gap(sentenceIdx), in.gap(sentenceIdx), false);
-
-    for (size_t wordIdx = 0; wordIdx < in.numWords(sentenceIdx); ++wordIdx) {
-      std::string token = fun(in.wordAsByteRange(sentenceIdx, wordIdx), in.word(sentenceIdx, wordIdx), false);
-      tokens.push_back(ByteRange{sentence.size(), sentence.size() + token.size()});
-      sentence += token;
-    }
-
-    // Convert our ByteRanges to string_views since that's what appendSentence
-    // expects
-    std::vector<marian::string_view> views(tokens.size());
-    std::transform(tokens.begin(), tokens.end(), views.begin(), [&](ByteRange const &range) {
-      return marian::string_view(sentence.data() + range.begin, range.size());
-    });
-
-    out.appendSentence(prefix, views.begin(), views.end());
-  }
-
-  out.appendEndingWhitespace(fun(in.annotation.gap(in.numSentences()), in.gap(in.numSentences()), true));
-
-  return out;
-}
-
 /// Tests whether `response` has alignment info associated with it or not.
 bool hasAlignments(Response const &response) {
   // Test for each sentence individually as a sentence may be empty (or there)
@@ -587,7 +552,7 @@ AnnotatedText HTML::restoreSource(AnnotatedText const &in, std::vector<SpanItera
                                  // and the while-loop below will do the rest
   assert(prevIt == spans_.end() || prevIt->tags.empty());
 
-  return apply(in, [&](ByteRange range, string_view token, bool last) {
+  return in.apply([&](ByteRange range, string_view token, bool last) {
     TokenFormatter formatter(token);
 
     // Potential issue: spans and tokens can intersect, e.g.
@@ -628,7 +593,7 @@ AnnotatedText HTML::restoreTarget(AnnotatedText const &in, std::vector<SpanItera
   auto targetSpanIt = targetTokenSpans.begin();
   auto straggerSpanIt = spans_.cbegin();
 
-  AnnotatedText out = apply(in, [&]([[maybe_unused]] ByteRange range, string_view token, bool last) {
+  AnnotatedText out = in.apply([&]([[maybe_unused]] ByteRange range, string_view token, bool last) {
     TokenFormatter formatter(token);
 
     // First we scan through spans_ to catch up to the span assigned to this

From 54be426b06943b6c05c283f977e05e2ca8c0f352 Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Mon, 21 Feb 2022 16:57:41 +0000
Subject: [PATCH 23/29] Try to reduce the number of nested conditions in
 consumeIgnoredTag a bit

---
 src/translator/html.cpp | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/src/translator/html.cpp b/src/translator/html.cpp
index 55c058216..a3c125d6f 100644
--- a/src/translator/html.cpp
+++ b/src/translator/html.cpp
@@ -273,7 +273,9 @@ void consumeIgnoredTag(markup::Scanner &scanner, HTML::Tag &tag, std::string con
   // which sets value() to start pointing at the body.
   const char *start = scanner.start();
 
-  // Consume the rest of the HTML until (including) the final closing tag.
+  // Consume the rest of the HTML until (including) the final closing tag. We
+  // start with the token that caused the previous loop to fall into the default
+  // case.
   while (inside) {
     switch (token) {
       case markup::Scanner::TT_ERROR:
@@ -281,22 +283,22 @@ void consumeIgnoredTag(markup::Scanner &scanner, HTML::Tag &tag, std::string con
       case markup::Scanner::TT_EOF:
         ABORT("Did not find closing tag </{}>");
       case markup::Scanner::TT_TAG_START:
-      case markup::Scanner::TT_TAG_END:
         // Note: Looking specifically for only our own type of tag so we don't
         // have to care about whether other tags we encounter are void tags or
         // not. Does assume the HTML is valid, as no stack is kept.
-        if (toLowerCase(scanner.tag()) == name) {
-          if (token == markup::Scanner::TT_TAG_END) {
-            if (--inside == 0) break;  // also stops loop because !inside
-          } else {
-            ++inside;
-          }
-        }
-        // intentional fall-through to scanner.next()!
+        if (toLowerCase(scanner.tag()) == name) ++inside;
+        break;
+      case markup::Scanner::TT_TAG_END:
+        if (toLowerCase(scanner.tag()) == name) --inside;
+        break;
       default:
-        token = scanner.next();
         break;
     }
+
+    // Only continue scanning if we're still inside. We could have just read the
+    // TT_TAG_END token that ended this element, and we don't want to continue
+    // consuming tokens at that point.
+    if (inside) token = scanner.next();
   }
 
   // Only a TAG_END could have stopped the previous loop. We take the start

From 279462cb819514b1b64532e6444a56fcfbfc2c7e Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Mon, 21 Feb 2022 17:40:27 +0000
Subject: [PATCH 24/29] Update tests for Ubuntu 18.04/avx2

---
 bergamot-translator-tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bergamot-translator-tests b/bergamot-translator-tests
index 4cda39cec..3776609ce 160000
--- a/bergamot-translator-tests
+++ b/bergamot-translator-tests
@@ -1 +1 @@
-Subproject commit 4cda39cecd1d0ec8b9ca8a4ff02ad608ae01b7cd
+Subproject commit 3776609ce5f7a238245e303efaa007b2d5078180

From 346821b231840f6dc83301738f7cdbbdb1b30a3d Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Mon, 21 Feb 2022 18:03:03 +0000
Subject: [PATCH 25/29] Revert int64_t to size_t (and mute tidy complaining
 about it)

---
 src/translator/html.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/translator/html.cpp b/src/translator/html.cpp
index a3c125d6f..ed42b9117 100644
--- a/src/translator/html.cpp
+++ b/src/translator/html.cpp
@@ -103,16 +103,18 @@ void diffTags(HTML::TagStack const &prev, HTML::TagStack const &curr, HTML::TagS
   opening.clear();
   closing.clear();
 
-  std::int64_t i = 0;
+  size_t i = 0;
 
   // Find first difference
   for (; i < prev.size(); ++i)
     if (i >= curr.size() || prev[i] != curr[i]) break;
 
   // Only nodes of type ELEMENT can have children and thus would need a closing tag.
+  // NOLINTNEXTLINE(bugprone-narrowing-conversions)
   std::copy_if(prev.begin() + i, prev.end(), std::back_inserter(closing),
                [&](HTML::Tag *tag) { return tag->type == HTML::Tag::ELEMENT; });
 
+  // NOLINTNEXTLINE(bugprone-narrowing-conversions)
   opening.insert(opening.end(), curr.begin() + i, curr.end());
 }
 

From 48cfc00378e18c52211963aabc30b0d48cd0f96c Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Tue, 22 Feb 2022 19:01:36 +0000
Subject: [PATCH 26/29] Bit more high level documentation on how HTML class
 works.

---
 src/translator/html.h | 154 ++++++++++++++++++++++++++++++++----------
 1 file changed, 120 insertions(+), 34 deletions(-)

diff --git a/src/translator/html.h b/src/translator/html.h
index a71490fba..0ccaf7f97 100644
--- a/src/translator/html.h
+++ b/src/translator/html.h
@@ -15,49 +15,108 @@ namespace marian::bergamot {
 
 struct Response;
 
+/// HTML class parses and removes HTML from input text, and places it back into
+/// the translated output text.
+///
+/// When parsing the HTML, it treats tags as markup, where a list of nested tags
+/// can be seen as a list of markups that are applicable to all the text that
+/// follows. This list is stored as a `TagStack`. Whenever an HTML tag opens or
+/// closes, a new TagStack is created to reflect that. The text between tags
+/// themselves is stored in the input variable. In `spans_`, the TagStack that
+/// is associated with a substring of that text is stored.
+/// When transferring the HTML from the source text to the translated target
+/// text, the TagStacks are first associated with each of the subwords from the
+/// source text. Using hard alignment, each subword in the source text is linked
+/// to a subword in the target text. The TagStacks are then copied over these
+/// links. Finally, the HTML is inserted back into the target text by comparing,
+/// for each subword, the TagStack of the previous subword to its own, and
+/// opening and closing elements to make up for the difference.
+///
+/// There are a couple of complexities though:
+/// 1. Not all tags can be treated as markup applied to text. For example, an
+///    `<img>` does not contain text itself. Or `<i></i>` does not. We do want
+///    those tags to remain in the output though. We do this by associating
+///    them to an empty `Span`. When inserting HTML back into the translation
+///    input or output, we keep track of where in the `spans_` vector we are,
+///    and insert any elements from empty spans that we might have skipped over
+///    because empty spans are never linked to tokens/subwords. These are
+///    *stragglers* in some parts of the code, or *void* or *empty* elements in
+///    other parts.
+/// 2. Some tags should be treated as paragraph indicators, and break up
+///    sentences. These are the usual suspects like `<p>`, but also `<li>` and
+///    `<td>`, to make sure we don't translate two table cells into a single
+///    word. This is the `addSentenceBreak` flag in the HTML parsing bit.
+///    We mark these breaks with `\n\n` in the input text and with a special
+///    WHITESPACE tag that we treat as any other void tag. Hopefully this tag
+///    moves with the added `\n\n` and it is easy for us to remove it again.
+///    (In practice it is, since these only occur at the end of sentences and
+///    the end of sentences are always aligned between source and target.)
+/// 3. We treat most tags as word-breaking. We do this by adding spaces just
+///    after where we saw the open or close tag occur. If there is already
+///    some whitespace in that place, we do not add extra spaces.
+/// 4. TODO
 class HTML {
  public:
   using TagNameSet = std::set<std::string, std::less<>>;
 
+  /// Options struct that controls how HTML is interpreted.
   struct Options {
-    // List of elements for which we do not expect a closing tag, or self-closing
-    // elements in XHTML. See also https://developer.mozilla.org/en-US/docs/Glossary/Empty_element
-    // More relevant source of this list:
-    // https://searchfox.org/mozilla-central/rev/7d17fd1fe9f0005a2fb19e5d53da4741b06a98ba/dom/base/FragmentOrElement.cpp#1791
+    /// List of elements for which we do not expect a closing tag, or
+    /// self-closing elements in XHTML. We do not need to see a closing tag
+    /// for these elements, and they cannot contain text or tags themselves.
+    /// See also:
+    /// https://developer.mozilla.org/en-US/docs/Glossary/Empty_element.
+    /// More relevant source of this list:
+    /// https://searchfox.org/mozilla-central/rev/7d17fd1fe9f0005a2fb19e5d53da4741b06a98ba/dom/base/FragmentOrElement.cpp#1791
     TagNameSet voidTags{"area", "base",  "basefont", "bgsound", "br",   "col",   "embed",  "frame", "hr",
                         "img",  "input", "keygen",   "link",    "meta", "param", "source", "track", "wbr"};
 
+    /// List of elements that are treated as inline, meaning they do not break
+    /// up sentences. Any element *not* in this list will cause the text that
+    /// follows its open or close tag to be treated as a separate sentence.
     TagNameSet inlineTags{"abbr",   "a", "b",    "em",    "i",    "kbd",    "mark", "math",
                           "output", "q", "ruby", "small", "span", "strong", "sub",  "sup",
                           "time",   "u", "var",  "wbr",   "ins",  "del",    "img"};
 
-    // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/wbr
+    /// List of elements that are, regardless of `substituteInlineTagsWithSpaces`,
+    /// not substituted with spaces. Technically almost all inline elements
+    /// should be treated like this, except `<br>` maybe, but in practice it
+    /// seems to be more effective to limit this set to just the one tag that
+    /// can only really be used *inside* words: `<wbr>`.
+    /// See also: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/wbr
     TagNameSet inWordTags{"wbr"};
 
-    // List of elements we copy as is, but do parse as if they're HTML because
-    // they could be nested. For <script> we just scan for </script> because
-    // the script tag may not be nested, but that is not the case for these
-    // elements per se. Some tags, like <script>, are ignored at the xh_scanner
-    // level. See xh_scanner.cpp/Scanner::scanAttribute().
+    /// List of elements we copy as is, but do parse as if they're HTML because
+    /// they could be nested. For <script> we just scan for </script> because
+    /// the script tag may not be nested, but that is not the case for these
+    /// elements per se. Some tags, like <script>, are ignored at the `Scanner`
+    /// level. See `xh_scanner.cpp/Scanner::scanAttribute()`.
     TagNameSet ignoredTags{"code", "kbd", "samp", "var", "dir", "acronym", "math"};
 
-    // List of characters that occur at the start of a token that indicate that
-    // the this token is probably *not* a continuation of a word. Set to empty
-    // to never mark a token as a continuation of the word.
+    /// List of characters that occur at the start of a token that indicate that
+    /// this token is probably *not* a continuation of a word. This is also
+    /// used to determine whether there should be a space after a closing tag
+    /// or not. I.e. a `.` after a `</strong>` does not need to be separated by
+    /// an extra space.
     std::string continuationDelimiters = "\n ,.(){}[]";
 
-    // Should we always add spaces to the places where tags used to be? I.e.
-    // `un<u>der</u>line` should become `un der line`?
+    /// Should we always add spaces to the places where tags used to be? I.e.
+    /// `un<u>der</u>line` should become `un der line`? This does help with
+    /// retaining tags inside words, or with odd pages that use CSS to add
+    /// spacing between a lot of tags. Cases like `<td>` and `<li>` are already
+    /// covered by treating them as sentence splitting.
     bool substituteInlineTagsWithSpaces = true;
   };
 
+  /// Represents a tag, or markup that is being applied to a string of text.
+  /// We treat all node types except `ELEMENT` as void or empty elements.
   struct Tag {
     enum NodeType {
-      ELEMENT,
-      VOID_ELEMENT,
-      COMMENT,
-      PROCESSING_INSTRUCTION,
-      WHITESPACE,  // negative space
+      ELEMENT,                 // <b>...</b>
+      VOID_ELEMENT,            // <img>
+      COMMENT,                 // <!-- ... -->
+      PROCESSING_INSTRUCTION,  // <?...?>
+      WHITESPACE,              // A \n\n we inserted to break a sentence.
     };
 
     NodeType type;           // Type of the node
@@ -66,46 +125,73 @@ class HTML {
                              // entities and prefix whitespace)
     std::string data;        // Raw data of an element that just needs to be
                              // copied as is, e.g. <script> or <style>
-    // @TODO: if the original HTML stays in memory, we could replace
-    // `attributes` and `data` with string_views pointing to it.
   };
 
+  /// Representation of markup that is being applied to a string of text. Order
+  /// matters as this represents how the tags are nested. The `Tag` objects
+  /// themselves are owned by `pool_`.
   using TagStack = std::vector<Tag *>;
 
+  /// Span of text, with which a `TagStack` is associated. A span may be empty,
+  /// for example to represent the presence of an empty or VOID element.
   struct Span {
-    size_t begin;
-    size_t end;
-    TagStack tags;  // Note: free pointers! Lifetime of tags is managed by pool_
+    size_t begin;   // Start offset in (plain text) source
+    size_t end;     // End offset in (plain text) source
+    TagStack tags;  // Note: free pointers to memory owned by `pool_`.
     inline size_t size() const { return end - begin; }
   };
 
+  /// Parses HTML in `source` (if `processMarkup` is true). `source` is updated
+  /// to only contain the plain text extracted from the HTML. `HTML` instance
+  /// retains information about what tags are extracted from where to later
+  /// reconstruct the HTML in a `Response` object (both `source` and `target`).
   explicit HTML(std::string &&source, bool processMarkup) : HTML(std::move(source), processMarkup, HTML::Options{}){};
   explicit HTML(std::string &&source, bool processMarkup, Options &&options);
+
+  /// It is not save to copy a HTML instance.
+  HTML(const HTML &copy) = delete;
+
+  /// Reconstructs (not perfectly) the HTML as it was parsed from `source`,
+  /// and uses alignment information to also reconstruct the same markup in
+  /// `response.target`.
   void restore(Response &response);
 
  private:
   using SpanIterator = std::vector<HTML::Span>::const_iterator;
   using AnnotatedText = marian::bergamot::AnnotatedText;
 
+  /// Reconstructs HTML in `response.source` (passed as `in`) and makes a list
+  /// `sourceTokenSpans` that associates a `Span` with each subword in `in`.
+  /// We later use these span pointers to copy tags. They're iterators (or
+  /// pointers into a list) to be able to compare whether one span came before
+  /// or after another span.
   AnnotatedText restoreSource(AnnotatedText const &in, std::vector<SpanIterator> &sourceTokenSpans);
+
+  /// Inserts the HTML into `response.target` (passed as `in`) based on
+  /// `targetTokenSpans`, which points to a `Span` for each token (subword) in
+  /// `response.target`.
   AnnotatedText restoreTarget(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans);
-  void copyTagStack(Response const &response, std::vector<std::vector<size_t>> const &alignments,
-                    std::vector<HTML::SpanIterator> const &sourceTokenSpans,
-                    std::vector<HTML::SpanIterator> &targetTokenSpans);
-  void hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments,
-                      std::vector<HTML::SpanIterator> const &sourceTokenSpans);
+  
+  /// Utilities to test whether subword `str` is part of a word together with
+  /// the subword `prev`, or a separate word. Basically *does `str` start with
+  /// a space, but bit more complex to deal with punctuation.
   bool isContinuation(marian::string_view prev, marian::string_view str) const;
   bool isContinuation(std::string_view prev, std::string_view str) const;
-  // Allocates tag in pool_ (which then owns it) and gives a pointer to be used
-  // in TagStacks. Pointer is valid as long as this HTML instance lives on.
+  
+  /// Allocates a tag in `pool_` (which then owns it) and gives a pointer to be
+  /// used in TagStacks. Pointer is valid as long as this HTML instance lives on.
   Tag *makeTag(Tag &&tag);
 
+  /// HTML options associated with this parse.
   Options options_;
 
-  // List of text spans, and which tags are applied to them
+  /// List of spans of text in plain text `source`, and which tags are applied
+  /// to them.
   std::vector<Span> spans_;
 
-  // a pool of tags that we free when HTML goes out of scope
+  /// A pool of tags. `std::forward_list` because we do not want pointers to it
+  /// to be invalidated when new tags are allocated. This way it is easy to
+  /// deallocate them all when `HTML` goes out of scope.
   std::forward_list<Tag> pool_;
 };
 

From a81dfdf2212eaa43ddbd42167a00d24a43dc8f76 Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Tue, 22 Feb 2022 19:49:52 +0000
Subject: [PATCH 27/29] Remark about 'taint'

---
 src/translator/html.h | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/translator/html.h b/src/translator/html.h
index 0ccaf7f97..aa9baec7b 100644
--- a/src/translator/html.h
+++ b/src/translator/html.h
@@ -21,9 +21,11 @@ struct Response;
 /// When parsing the HTML, it treats tags as markup, where a list of nested tags
 /// can be seen as a list of markups that are applicable to all the text that
 /// follows. This list is stored as a `TagStack`. Whenever an HTML tag opens or
-/// closes, a new TagStack is created to reflect that. The text between tags
-/// themselves is stored in the input variable. In `spans_`, the TagStack that
-/// is associated with a substring of that text is stored.
+/// closes, a new TagStack is created to reflect that. TagStack used to be
+/// called `Taint` because it *tainted* the associated text with those tags
+/// as markup. The text between tags themselves is stored in the
+/// input variable. In `spans_`, the TagStack that is associated with a
+/// substring of that text is stored.
 /// When transferring the HTML from the source text to the translated target
 /// text, the TagStacks are first associated with each of the subwords from the
 /// source text. Using hard alignment, each subword in the source text is linked
@@ -171,13 +173,13 @@ class HTML {
   /// `targetTokenSpans`, which points to a `Span` for each token (subword) in
   /// `response.target`.
   AnnotatedText restoreTarget(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans);
-  
+
   /// Utilities to test whether subword `str` is part of a word together with
   /// the subword `prev`, or a separate word. Basically *does `str` start with
   /// a space, but bit more complex to deal with punctuation.
   bool isContinuation(marian::string_view prev, marian::string_view str) const;
   bool isContinuation(std::string_view prev, std::string_view str) const;
-  
+
   /// Allocates a tag in `pool_` (which then owns it) and gives a pointer to be
   /// used in TagStacks. Pointer is valid as long as this HTML instance lives on.
   Tag *makeTag(Tag &&tag);

From bbfa4e3ba6a3d6ac1818a92f3a782bae71126e7c Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Tue, 22 Feb 2022 21:11:23 +0100
Subject: [PATCH 28/29] Fix the constructor situation

---
 src/translator/html.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/translator/html.h b/src/translator/html.h
index aa9baec7b..7e3260511 100644
--- a/src/translator/html.h
+++ b/src/translator/html.h
@@ -151,7 +151,10 @@ class HTML {
   explicit HTML(std::string &&source, bool processMarkup, Options &&options);
 
   /// It is not save to copy a HTML instance.
-  HTML(const HTML &copy) = delete;
+  HTML(const HTML &) = delete;
+
+  /// Moving is fine
+  HTML(HTML &&) = default;
 
   /// Reconstructs (not perfectly) the HTML as it was parsed from `source`,
   /// and uses alignment information to also reconstruct the same markup in

From ea10e9165cdc3d7b583c7449dd1bd72407e9ba69 Mon Sep 17 00:00:00 2001
From: Jelmer van der Linde <jelmer@ikhoefgeen.nl>
Date: Tue, 22 Feb 2022 21:11:45 +0100
Subject: [PATCH 29/29] Add accidentally removed private methods back to header

---
 src/translator/html.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/translator/html.h b/src/translator/html.h
index 7e3260511..c704c5904 100644
--- a/src/translator/html.h
+++ b/src/translator/html.h
@@ -183,6 +183,21 @@ class HTML {
   bool isContinuation(marian::string_view prev, marian::string_view str) const;
   bool isContinuation(std::string_view prev, std::string_view str) const;
 
+  /// Copies span pointers from the subwords/tokens from the source text to the
+  /// subwords of the target text in `targetTokenSpans` using alignment
+  /// information in `response`.
+  void copyTagStack(Response const &response, std::vector<std::vector<size_t>> const &alignments,
+                    std::vector<HTML::SpanIterator> const &sourceTokenSpans,
+                    std::vector<HTML::SpanIterator> &targetTokenSpans);
+
+  /// Turns the alignment scores in `response.alignments` into one source token
+  /// per target token. Has some heuristics to keep all target tokens of a
+  /// single word pointing to the same span, and prefers spans with more markup
+  /// over spans with less to try to retain as much of the input markup as
+  /// possible.
+  void hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments,
+                      std::vector<HTML::SpanIterator> const &sourceTokenSpans);
+
   /// Allocates a tag in `pool_` (which then owns it) and gives a pointer to be
   /// used in TagStacks. Pointer is valid as long as this HTML instance lives on.
   Tag *makeTag(Tag &&tag);