, others only )
// tokens |111111111111111|2|
//
- // Now 1 covers span 1 to 3, so what taint should it get? Just <p>, or
- // <p><u>?
- // Note: only relevant if isBlockElement is used. If we just insert spaces
- // around all elements, every segment of `hello` will be a token.
+ // Now 1 covers span 1 to 3, so what taint should it get? Just `<p>`, or
+ // `<p><u>`?
+ // Note: only relevant if `substituteInlineTagsWithSpaces` is true. If we
+ // just insert spaces around all elements, every segment of `hello` will be
+ // a token.
// Seek to the last span that overlaps with this token
while (true) {
@@ -494,7 +586,7 @@ AnnotatedText HTML::restoreSource(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans) {
auto prevSpan = spans_.cbegin();
auto targetSpanIt = targetTokenSpans.begin();
+ auto straggerSpanIt = spans_.cbegin();
- AnnotatedText out = apply(in, [&](ByteRange range, string_view token, bool last) {
+ AnnotatedText out = in.apply([&]([[maybe_unused]] ByteRange range, string_view token, bool last) {
TokenFormatter formatter(token);
// First we scan through spans_ to catch up to the span assigned to this
// token. We're only interested in empty spans (empty and void elements)
- for (auto span_it = prevSpan; span_it < *targetSpanIt; span_it++) {
+ for (; straggerSpanIt < *targetSpanIt; ++straggerSpanIt) {
// We're only interested in empty spans or spans that would otherwise get
// lost because they didn't align with anything between the spans in
// targetSpanIt
// TODO That std::find makes this O(N*N) NOT GOOD NOT GOOD
- if (span_it->size() != 0 &&
- std::find(targetTokenSpans.begin(), targetTokenSpans.end(), span_it) != targetTokenSpans.end())
+ if (straggerSpanIt->size() != 0 &&
+ std::find(targetTokenSpans.begin(), targetTokenSpans.end(), straggerSpanIt) != targetTokenSpans.end())
continue;
- formatter.append(prevSpan->tags, span_it->tags);
+ formatter.append(prevSpan->tags, straggerSpanIt->tags);
// Note: here, not in 3rd part of for-statement because we don't want to
// set prevSpan if the continue clause at the beginning of this for-loop
// was hit.
- prevSpan = span_it;
+ prevSpan = straggerSpanIt;
}
// Now do the same thing but for our target set of tags. Note that we cannot
@@ -539,7 +632,7 @@ AnnotatedText HTML::restoreTarget(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans) {
       // assert((*targetSpanIt)->tags.empty());
- formatter.append((*targetSpanIt)->tags, HTML::Taint());
+ formatter.append((*targetSpanIt)->tags, HTML::TagStack());
}
prevSpan = *targetSpanIt;
@@ -559,8 +652,9 @@ HTML::Tag *HTML::makeTag(Tag &&tag) {
return &pool_.front();
}
-void HTML::copyTaint(Response const &response, std::vector<std::vector<size_t>> const &alignments,
-                     std::vector<SpanIterator> const &sourceTokenSpans, std::vector<SpanIterator> &targetTokenSpans) {
+void HTML::copyTagStack(Response const &response, std::vector<std::vector<size_t>> const &alignments,
+                        std::vector<SpanIterator> const &sourceTokenSpans,
+                        std::vector<SpanIterator> &targetTokenSpans) {
size_t offset = 0; // Sentence offset in sourceTokenSpans
// Fill targetTokenSpans based on the alignments we just made up.
@@ -584,14 +678,25 @@ void HTML::copyTaint(Response const &response, std::vector<std::vector<size_t>>
// to determine whether we should share the markup, or whether we should see
// this token as a fresh start. This implementation will treat "hello[world]"
// as 4 words, assuming its tokenised as something like `h ell o [ wor ld ]`.
-bool HTML::isContinuation(string_view prev, string_view str) {
+bool HTML::isContinuation(std::string_view prev, std::string_view str) const {
if (options_.continuationDelimiters.empty()) return false;
if (prev.empty() || str.empty()) return false;
return options_.continuationDelimiters.find(str[0]) == std::string::npos &&
options_.continuationDelimiters.find(prev.back()) == std::string::npos;
}
-void HTML::hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments) {
+bool HTML::isContinuation(marian::string_view prev, marian::string_view str) const {
+ return isContinuation(std::string_view(prev.data(), prev.size()), std::string_view(str.data(), str.size()));
+}
+
+/// Selects for each token in `response.target` a best source token from
+/// `response.source` and writes this selection to `alignments`. The source
+/// token spans are used to also look at the markup applied to each token to
+/// figure out which source token best represents each target token.
+void HTML::hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments,
+                          std::vector<SpanIterator> const &sourceTokenSpans) {
+ size_t offset = 0; // sentence offset in sourceTokenSpans
+
// For each sentence...
for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
alignments.emplace_back();
@@ -600,14 +705,9 @@ void HTML::hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments,
-      size_t maxS = 0;
-      for (size_t s = 0; s < response.alignments[sentenceIdx][t].size(); ++s) {
-        if (response.alignments[sentenceIdx][t][s] > response.alignments[sentenceIdx][t][maxS]) {
- maxS = s;
- }
- }
-
- alignments.back().push_back(maxS);
+ alignments.back().push_back(
+ std::max_element(response.alignments[sentenceIdx][t].begin(), response.alignments[sentenceIdx][t].end()) -
+ response.alignments[sentenceIdx][t].begin());
}
// Next, we try to smooth out these selected alignments with a few heuristics
@@ -622,7 +722,14 @@ void HTML::hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments,
-        if (currScore >= prevScore) {
+ TagStack const &currTagStack = sourceTokenSpans[offset + 1 + currSentenceIdx]->tags;
+ TagStack const &prevTagStack = sourceTokenSpans[offset + 1 + prevSentenceIdx]->tags;
+
+ // If this token has more markup, or a better score than the previous
+ // token (and they together are part of a word-ish thing) then mark
+ // this word as aligning. Otherwise just copy the alignment source of
+ // the previous token.
+ if (extends(currTagStack, prevTagStack) || currScore >= prevScore) {
// Apply this to all previous tokens in the word
for (size_t i = t;; --i) {
alignments.back()[i] = currSentenceIdx;
@@ -640,6 +747,8 @@ void HTML::hardAlignments(Response const &response, std::vector<std::vector<size_t>>
+#include <set>
 #include <string>
 #include <string_view>
-#include <unordered_set>
+#include <vector>
#include "annotation.h"
+#include "data/types.h"
#include "definitions.h"
-namespace marian {
-namespace bergamot {
+namespace marian::bergamot {
struct Response;
+/// HTML class parses and removes HTML from input text, and places it back into
+/// the translated output text.
+///
+/// When parsing the HTML, it treats tags as markup, where a list of nested tags
+/// can be seen as a list of markups that are applicable to all the text that
+/// follows. This list is stored as a `TagStack`. Whenever an HTML tag opens or
+/// closes, a new TagStack is created to reflect that. TagStack used to be
+/// called `Taint` because it *tainted* the text it was associated with with
+/// those tags as markup. The text between tags themselves is stored in the
+/// input variable. In `spans_`, the TagStack that is associated with a
+/// substring of that text is stored.
+/// When transferring the HTML from the source text to the translated target
+/// text, the TagStacks are first associated with each of the subwords from the
+/// source text. Using hard alignment, each subword in the source text is linked
+/// to a subword in the target text. The TagStacks are then copied over these
+/// links. Finally, the HTML is inserted back into the target text by for each
+/// subword, comparing the TagStack from the previous word to that word, and
+/// opening and closing elements to make up for the difference.
+///
+/// There are a couple of complexities though:
+/// 1. Not all tags can be treated as markup applied to text. For example, an
+///    `<img>` does not contain text itself. Or `<br>` does not. We do want
+/// those tags to remain in the output though. We do this by associating
+/// them to an empty `Span`. When inserting HTML back into the translation
+/// input or output, we keep track of where in the `spans_` vector we are,
+/// and insert any elements from empty spans that we might have skipped over
+/// because empty spans are never linked to tokens/subwords. These are
+/// *stragglers* in some parts of the code, or *void* or *empty* elements in
+/// other parts.
+/// 2. Some tags should be treated as paragraph indicators, and break up
+///    sentences. These are the usual suspects like `<p>`, but also `<br>` and
+///    `<td>`, to make sure we don't translate two table cells into a single
+/// word. This is the `addSentenceBreak` flag in the HTML parsing bit.
+/// We mark these breaks with `\n\n` in the input text and with a special
+/// WHITESPACE tag that we treat as any other void tag. Hopefully this tag
+/// moves with the added `\n\n` and it is easy for us to remove it again.
+///    (in practice it is since these only occur at the end of sentences and
+/// the end of sentences are always aligned between source and target.)
+/// 3. We treat most tags as word-breaking. We do this by adding spaces just
+/// after where we saw the open or close tag occur. If there is already
+/// some whitespace in that place, we do not add extra spaces.
+/// 4. TODO
class HTML {
public:
+  using TagNameSet = std::set<std::string, std::less<>>;
+
+ /// Options struct that controls how HTML is interpreted.
struct Options {
- // List of elements for which we do not expect a closing tag, or self-closing
- // elements in XHTML. See also https://developer.mozilla.org/en-US/docs/Glossary/Empty_element
- // More relevant source of this list:
- // https://searchfox.org/mozilla-central/rev/7d17fd1fe9f0005a2fb19e5d53da4741b06a98ba/dom/base/FragmentOrElement.cpp#1791
-    std::unordered_set<std::string> voidTags{"area", "base", "basefont", "bgsound", "br", "col",
-                                             "embed", "frame", "hr", "img", "input", "keygen",
-                                             "link", "meta", "param", "source", "track", "wbr"};
-
-    std::unordered_set<std::string> inlineTags{"abbr", "a", "b", "em", "i", "kbd", "mark", "math",
-                                               "output", "q", "ruby", "small", "span", "strong", "sub", "sup",
-                                               "time", "u", "var", "wbr", "ins", "del", "img"};
-
- // List of characters that occur at the start of a token that indicate that
- // the this token is probably *not* a continuation of a word. Set to empty
- // to never mark a token as a continuation of the word.
- // std::string continuationDelimiters = "\n ,.(){}[]";
- std::string continuationDelimiters;
-
- // Should we always add spaces to the places where tags used to be? I.e.
- // `underline` should become `un der line`?
+ /// List of elements for which we do not expect a closing tag, or
+ /// self-closing elements in XHTML. We do not need to see a closing tag
+ /// for these elements, and they cannot contain text or tags themselves.
+ /// See also:
+ /// https://developer.mozilla.org/en-US/docs/Glossary/Empty_element.
+ /// More relevant source of this list:
+ /// https://searchfox.org/mozilla-central/rev/7d17fd1fe9f0005a2fb19e5d53da4741b06a98ba/dom/base/FragmentOrElement.cpp#1791
+ TagNameSet voidTags{"area", "base", "basefont", "bgsound", "br", "col", "embed", "frame", "hr",
+ "img", "input", "keygen", "link", "meta", "param", "source", "track", "wbr"};
+
+ /// List of elements that are treated as inline, meaning they do not break
+ /// up sentences. Any element *not* in this list will cause the text that
+ /// follows its open or close tag to be treated as a separate sentence.
+ TagNameSet inlineTags{"abbr", "a", "b", "em", "i", "kbd", "mark", "math",
+ "output", "q", "ruby", "small", "span", "strong", "sub", "sup",
+ "time", "u", "var", "wbr", "ins", "del", "img"};
+
+ /// List of elements that are, regardless of `substituteInlineTagsWithSpaces`,
+ /// not substituted with spaces. Technically almost all inline elements
+    /// should be treated like this, except `<a>` maybe. But in practice it
+    /// seems to be more effective to limit this set to just that one tag
+    /// that can only really be used *inside* words: `<wbr>`.
+ /// See also: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/wbr
+ TagNameSet inWordTags{"wbr"};
+
+ /// List of elements we copy as is, but do parse as if they're HTML because
+    /// they could be nested. For `<script>` this would not be necessary because
+ /// the script tag may not be nested, but that is not the case for these
+ /// elements per se. Some tags, like |