, others only )
// tokens |111111111111111|2|
//
- // Now 1 covers span 1 to 3, so what taint should it get? Just <p>, or
- // <p><u>?
- // Note: only relevant if isBlockElement is used. If we just insert spaces
- // around all elements, every segment of `hello` will be a token.
+ // Now 1 covers span 1 to 3, so what taint should it get? Just `<p>`, or
+ // `<p><u>`?
+ // Note: only relevant if `substituteInlineTagsWithSpaces` is true. If we
+ // just insert spaces around all elements, every segment of `hello` will be
+ // a token.
// Seek to the last span that overlaps with this token
while (true) {
@@ -494,7 +586,7 @@ AnnotatedText HTML::restoreSource(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans) {
auto prevSpan = spans_.cbegin();
auto targetSpanIt = targetTokenSpans.begin();
+ auto straggerSpanIt = spans_.cbegin();
- AnnotatedText out = apply(in, [&](ByteRange range, string_view token, bool last) {
+ AnnotatedText out = in.apply([&]([[maybe_unused]] ByteRange range, string_view token, bool last) {
TokenFormatter formatter(token);
// First we scan through spans_ to catch up to the span assigned to this
// token. We're only interested in empty spans (empty and void elements)
- for (auto span_it = prevSpan; span_it < *targetSpanIt; span_it++) {
+ for (; straggerSpanIt < *targetSpanIt; ++straggerSpanIt) {
// We're only interested in empty spans or spans that would otherwise get
// lost because they didn't align with anything between the spans in
// targetSpanIt
// TODO That std::find makes this O(N*N) NOT GOOD NOT GOOD
- if (span_it->size() != 0 &&
- std::find(targetTokenSpans.begin(), targetTokenSpans.end(), span_it) != targetTokenSpans.end())
+ if (straggerSpanIt->size() != 0 &&
+ std::find(targetTokenSpans.begin(), targetTokenSpans.end(), straggerSpanIt) != targetTokenSpans.end())
continue;
- formatter.append(prevSpan->tags, span_it->tags);
+ formatter.append(prevSpan->tags, straggerSpanIt->tags);
// Note: here, not in 3rd part of for-statement because we don't want to
// set prevSpan if the continue clause at the beginning of this for-loop
// was hit.
- prevSpan = span_it;
+ prevSpan = straggerSpanIt;
}
// Now do the same thing but for our target set of tags. Note that we cannot
@@ -539,7 +632,7 @@ AnnotatedText HTML::restoreTarget(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans) {
       // assert((*targetSpanIt)->tags.empty());
- formatter.append((*targetSpanIt)->tags, HTML::Taint());
+ formatter.append((*targetSpanIt)->tags, HTML::TagStack());
}
prevSpan = *targetSpanIt;
@@ -559,8 +652,9 @@ HTML::Tag *HTML::makeTag(Tag &&tag) {
return &pool_.front();
}
-void HTML::copyTaint(Response const &response, std::vector<std::vector<size_t>> const &alignments,
-                     std::vector<SpanIterator> const &sourceTokenSpans, std::vector<SpanIterator> &targetTokenSpans) {
+void HTML::copyTagStack(Response const &response, std::vector<std::vector<size_t>> const &alignments,
+                        std::vector<SpanIterator> const &sourceTokenSpans,
+                        std::vector<SpanIterator> &targetTokenSpans) {
size_t offset = 0; // Sentence offset in sourceTokenSpans
// Fill targetTokenSpans based on the alignments we just made up.
@@ -584,14 +678,25 @@ void HTML::copyTaint(Response const &response, std::vector<std::vector<size_t>>
// to determine whether we should share the markup, or whether we should see
// this token as a fresh start. This implementation will treat "hello[world]"
// as 4 words, assuming its tokenised as something like `h ell o [ wor ld ]`.
-bool HTML::isContinuation(string_view prev, string_view str) {
+bool HTML::isContinuation(std::string_view prev, std::string_view str) const {
if (options_.continuationDelimiters.empty()) return false;
if (prev.empty() || str.empty()) return false;
return options_.continuationDelimiters.find(str[0]) == std::string::npos &&
options_.continuationDelimiters.find(prev.back()) == std::string::npos;
}
-void HTML::hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments) {
+bool HTML::isContinuation(marian::string_view prev, marian::string_view str) const {
+ return isContinuation(std::string_view(prev.data(), prev.size()), std::string_view(str.data(), str.size()));
+}
+
+/// Selects for each token in `response.target` a best source token from
+/// `response.source` and writes this selection to `alignments`. The source
+/// token spans are used to also look at the markup applied to each token to
+/// figure out which source token best represents each target token.
+void HTML::hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments,
+                          std::vector<SpanIterator> const &sourceTokenSpans) {
+ size_t offset = 0; // sentence offset in sourceTokenSpans
+
// For each sentence...
for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
alignments.emplace_back();
@@ -600,14 +705,9 @@ void HTML::hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments,
-      size_t maxS = 0;
-      for (size_t s = 0; s < response.alignments[sentenceIdx][t].size(); ++s) {
-        if (response.alignments[sentenceIdx][t][s] > response.alignments[sentenceIdx][t][maxS]) {
- maxS = s;
- }
- }
-
- alignments.back().push_back(maxS);
+ alignments.back().push_back(
+ std::max_element(response.alignments[sentenceIdx][t].begin(), response.alignments[sentenceIdx][t].end()) -
+ response.alignments[sentenceIdx][t].begin());
}
// Next, we try to smooth out these selected alignments with a few heuristics
@@ -622,7 +722,14 @@ void HTML::hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments,
-        if (currScore >= prevScore) {
+ TagStack const &currTagStack = sourceTokenSpans[offset + 1 + currSentenceIdx]->tags;
+ TagStack const &prevTagStack = sourceTokenSpans[offset + 1 + prevSentenceIdx]->tags;
+
+ // If this token has more markup, or a better score than the previous
+ // token (and they together are part of a word-ish thing) then mark
+ // this word as aligning. Otherwise just copy the alignment source of
+ // the previous token.
+ if (extends(currTagStack, prevTagStack) || currScore >= prevScore) {
// Apply this to all previous tokens in the word
for (size_t i = t;; --i) {
alignments.back()[i] = currSentenceIdx;
@@ -640,6 +747,8 @@ void HTML::hardAlignments(Response const &response, std::vector<std::vector<size_t>>
+#include <set>
 #include <string>
 #include <string_view>
-#include <unordered_set>
+#include <vector>
#include "annotation.h"
+#include "data/types.h"
#include "definitions.h"
-namespace marian {
-namespace bergamot {
+namespace marian::bergamot {
struct Response;
+/// HTML class parses and removes HTML from input text, and places it back into
+/// the translated output text.
+///
+/// When parsing the HTML, it treats tags as markup, where a list of nested tags
+/// can be seen as a list of markups that are applicable to all the text that
+/// follows. This list is stored as a `TagStack`. Whenever an HTML tag opens or
+/// closes, a new TagStack is created to reflect that. TagStack used to be
+/// called `Taint` because it *tainted* the text it was associated with with
+/// those tags as markup. The text between tags themselves is stored in the
+/// input variable. In `spans_`, the TagStack that is associated with a
+/// substring of that text is stored.
+/// When transferring the HTML from the source text to the translated target
+/// text, the TagStacks are first associated with each of the subwords from the
+/// source text. Using hard alignment, each subword in the source text is linked
+/// to a subword in the target text. The TagStacks are then copied over these
+/// links. Finally, the HTML is inserted back into the target text by for each
+/// subword, comparing the TagStack from the previous word to that word, and
+/// opening and closing elements to make up for the difference.
+///
+/// There are a couple of complexities though:
+/// 1. Not all tags can be treated as markup applied to text. For example, an
+///    `<img>` does not contain text itself. Or `<br>` does not. We do want
+/// those tags to remain in the output though. We do this by associating
+/// them to an empty `Span`. When inserting HTML back into the translation
+/// input or output, we keep track of where in the `spans_` vector we are,
+/// and insert any elements from empty spans that we might have skipped over
+/// because empty spans are never linked to tokens/subwords. These are
+/// *stragglers* in some parts of the code, or *void* or *empty* elements in
+/// other parts.
+/// 2. Some tags should be treated as paragraph indicators, and break up
+///    sentences. These are the usual suspects like `<p>`, but also `<br>` and
+///    `<td>`, to make sure we don't translate two table cells into a single
+/// word. This is the `addSentenceBreak` flag in the HTML parsing bit.
+/// We mark these breaks with `\n\n` in the input text and with a special
+/// WHITESPACE tag that we treat as any other void tag. Hopefully this tag
+/// moves with the added `\n\n` and it is easy for us to remove it again.
+///    (in practice it is since these only occur at the end of sentences and
+/// the end of sentences are always aligned between source and target.)
+/// 3. We treat most tags as word-breaking. We do this by adding spaces just
+/// after where we saw the open or close tag occur. If there is already
+/// some whitespace in that place, we do not add extra spaces.
+/// 4. TODO
class HTML {
public:
+  using TagNameSet = std::set<std::string, std::less<>>;
+
+ /// Options struct that controls how HTML is interpreted.
struct Options {
- // List of elements for which we do not expect a closing tag, or self-closing
- // elements in XHTML. See also https://developer.mozilla.org/en-US/docs/Glossary/Empty_element
- // More relevant source of this list:
- // https://searchfox.org/mozilla-central/rev/7d17fd1fe9f0005a2fb19e5d53da4741b06a98ba/dom/base/FragmentOrElement.cpp#1791
-    std::unordered_set<std::string> voidTags{"area", "base", "basefont", "bgsound", "br", "col",
-                                             "embed", "frame", "hr", "img", "input", "keygen",
-                                             "link", "meta", "param", "source", "track", "wbr"};
-
-    std::unordered_set<std::string> inlineTags{"abbr", "a", "b", "em", "i", "kbd", "mark", "math",
-                                               "output", "q", "ruby", "small", "span", "strong", "sub", "sup",
-                                               "time", "u", "var", "wbr", "ins", "del", "img"};
-
- // List of characters that occur at the start of a token that indicate that
- // the this token is probably *not* a continuation of a word. Set to empty
- // to never mark a token as a continuation of the word.
- // std::string continuationDelimiters = "\n ,.(){}[]";
- std::string continuationDelimiters;
-
- // Should we always add spaces to the places where tags used to be? I.e.
- // `underline` should become `un der line`?
+ /// List of elements for which we do not expect a closing tag, or
+ /// self-closing elements in XHTML. We do not need to see a closing tag
+ /// for these elements, and they cannot contain text or tags themselves.
+ /// See also:
+ /// https://developer.mozilla.org/en-US/docs/Glossary/Empty_element.
+ /// More relevant source of this list:
+ /// https://searchfox.org/mozilla-central/rev/7d17fd1fe9f0005a2fb19e5d53da4741b06a98ba/dom/base/FragmentOrElement.cpp#1791
+ TagNameSet voidTags{"area", "base", "basefont", "bgsound", "br", "col", "embed", "frame", "hr",
+ "img", "input", "keygen", "link", "meta", "param", "source", "track", "wbr"};
+
+ /// List of elements that are treated as inline, meaning they do not break
+ /// up sentences. Any element *not* in this list will cause the text that
+ /// follows its open or close tag to be treated as a separate sentence.
+ TagNameSet inlineTags{"abbr", "a", "b", "em", "i", "kbd", "mark", "math",
+ "output", "q", "ruby", "small", "span", "strong", "sub", "sup",
+ "time", "u", "var", "wbr", "ins", "del", "img"};
+
+ /// List of elements that are, regardless of `substituteInlineTagsWithSpaces`,
+ /// not substituted with spaces. Technically almost all inline elements
+    /// should be treated like this, except `<a>` maybe. But in practice it
+    /// seems to be more effective to limit this set to just that one tag
+    /// that can only really be used *inside* words: `<wbr>`.
+ /// See also: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/wbr
+ TagNameSet inWordTags{"wbr"};
+
+ /// List of elements we copy as is, but do parse as if they're HTML because
+    /// they could be nested. For `<script>` this would not be necessary because
+ /// the script tag may not be nested, but that is not the case for these
+ /// elements per se. Some tags, like |