diff --git a/lib/LaTeXML/Core/Document.pm b/lib/LaTeXML/Core/Document.pm index 9542c8979..989c54d37 100644 --- a/lib/LaTeXML/Core/Document.pm +++ b/lib/LaTeXML/Core/Document.pm @@ -678,12 +678,19 @@ sub insertComment { my ($self, $text) = @_; chomp($text); $text =~ s/\-\-+/__/g; - $self->closeText_internal; # Close any open text node. my $comment; + my $prev = $$self{node}->lastChild; + my $prevtype = $prev && $prev->nodeType; if ($$self{node}->nodeType == XML_DOCUMENT_NODE) { push(@{ $$self{pending} }, $comment = $$self{document}->createComment(' ' . $text . ' ')); } - elsif (($comment = $$self{node}->lastChild) && ($comment->nodeType == XML_COMMENT_NODE)) { + elsif ($prevtype && ($prevtype == XML_COMMENT_NODE)) { + $comment = $prev; $comment->setData($comment->data . "\n " . $text . ' '); } + elsif ($prevtype && ($prevtype == XML_TEXT_NODE)) { # Put comment BEFORE text node + if (($comment = $prev->previousSibling) && ($comment->nodeType == XML_COMMENT_NODE)) { + $comment = $$self{node}->appendChild($$self{document}->createComment(' ' . $text . ' ')); } + else { + $comment = $$self{node}->insertBefore($$self{document}->createComment(' ' . $text . ' '), $prev); } } else { $comment = $$self{node}->appendChild($$self{document}->createComment(' ' . $text . ' ')); } return $comment; } diff --git a/lib/LaTeXML/Core/Parameter.pm b/lib/LaTeXML/Core/Parameter.pm index 83341906d..8988b3e0a 100644 --- a/lib/LaTeXML/Core/Parameter.pm +++ b/lib/LaTeXML/Core/Parameter.pm @@ -134,7 +134,7 @@ sub digest { my ($igullet) = @_; $igullet->unread($value); my @tokens = (); - while (defined(my $token = $igullet->getPendingComment || $igullet->readXToken(1, 1))) { + while (defined(my $token = $igullet->getPendingComment || $igullet->readXToken(1))) { push(@tokens, $token); } $value = Tokens(@tokens); $value = $value->neutralize; }); } } diff --git a/lib/LaTeXML/Core/Stomach.pm b/lib/LaTeXML/Core/Stomach.pm index 4f5236e79..2ed763606 100644 --- a/lib/LaTeXML/Core/Stomach.pm +++ b/lib/LaTeXML/Core/Stomach.pm @@ -96,7 +96,7 @@ sub digestNextBody { my $alignment = $STATE->lookupValue('Alignment'); my @aug = (); - while (defined($token = $$self{gullet}->getPendingComment || $$self{gullet}->readXToken(1, 1))) { + while (defined($token = $$self{gullet}->getPendingComment || $$self{gullet}->readXToken(1))) { if ($alignment && scalar(@LaTeXML::LIST) && (Equals($token, T_ALIGN) || Equals($token, T_CS('\cr')) || Equals($token, T_CS('\hidden@cr')) || Equals($token, T_CS('\hidden@crcr')))) { @@ -132,7 +132,7 @@ sub digest { my $initdepth = scalar(@{ $$self{boxing} }); local @LaTeXML::LIST = (); while (defined(my $token = - $$self{gullet}->getPendingComment || $$self{gullet}->readXToken(1, 1))) { + $$self{gullet}->getPendingComment || $$self{gullet}->readXToken(1))) { push(@LaTeXML::LIST, $self->invokeToken($token)); last if $initdepth > scalar(@{ $$self{boxing} }); } # if we've closed the initial mode. List(@LaTeXML::LIST, mode => ($ismath ? 'math' : 'text')); diff --git a/lib/LaTeXML/Core/Tokens.pm b/lib/LaTeXML/Core/Tokens.pm index c2326ce8f..88ead1910 100644 --- a/lib/LaTeXML/Core/Tokens.pm +++ b/lib/LaTeXML/Core/Tokens.pm @@ -60,17 +60,21 @@ sub revert { # NOT for creating valid TeX (use revert or UnTeX for that!) sub toString { my ($self) = @_; - return join('', map { $_->toString } @$self); } + return join('', map { ($$_[1] == CC_COMMENT ? '' : $_->toString) } @$self); } # Methods for overloaded ops. + +# Compare two Tokens lists, ignoring comments & markers sub equals { my ($a, $b) = @_; return 0 unless defined $b && (ref $a) eq (ref $b); my @a = @$a; my @b = @$b; - while (@a && @b && ($a[0]->equals($b[0]))) { - shift(@a); shift(@b); } - return !(@a || @b); } + while (@a || @b) { + if (@a && (($a[0]->[1] == CC_COMMENT) || ($a[0]->[1] == CC_MARKER))) { shift(@a); next; } + if (@b && (($b[0]->[1] == CC_COMMENT) || ($b[0]->[1] == CC_MARKER))) { shift(@b); next; } + return unless @a && @b && shift(@a)->equals(shift(@b)); } + return 1; } sub stringify { my ($self) = @_; diff --git a/lib/LaTeXML/Package/TeX.pool.ltxml b/lib/LaTeXML/Package/TeX.pool.ltxml index d8ff3aa70..fd414127e 100644 --- a/lib/LaTeXML/Package/TeX.pool.ltxml +++ b/lib/LaTeXML/Package/TeX.pool.ltxml @@ -351,7 +351,9 @@ DefParameterType('DefToken', sub { my ($gullet) = @_; my $token = $gullet->readToken; while ($token && ($token->getCatcode == CC_BEGIN)) { - my @toks = grep { !$_->equals(T_SPACE) } $gullet->readBalanced->unlist; + my $cc; + my @toks = grep { ($cc = $$_[1]) && ($cc != CC_SPACE) && ($cc != CC_COMMENT); } + $gullet->readBalanced->unlist; $token = shift(@toks); $gullet->unread(@toks); } $token; }, @@ -3609,8 +3611,8 @@ sub pruneEmpty { my ($document, $node) = @_; # In some cases we could have e.g. a \noindent followed by a {table}, # in which case we end up with an empty ltx:para which we can prune. - if (!scalar($node->childNodes)) { - my $prev = $node->previousSibling; + if (!scalar(element_nodes($node))) { + my $prev = element_prev($node); if (!$prev || ($document->getNodeQName($prev) ne 'ltx:para')) { # If $node WAS the 1st child $document->addClass($node->parentNode, 'ltx_pruned_first'); } $node->unlinkNode; } @@ -3899,7 +3901,9 @@ sub cleanup_Math { push(@texts, $space); } } } else { # is XMText foreach my $child ($xmnode->childNodes) { - if ($child->nodeType != XML_ELEMENT_NODE) { # Make sure we've got an element + my $t = $child->nodeType; + if ($t == XML_COMMENT_NODE) { } + elsif ($t != XML_ELEMENT_NODE) { # Make sure we've got an element push(@texts, ['ltx:text', { class => 'ltx_markedasmath' }, $child]); } else { $document->addClass($child, 'ltx_markedasmath'); @@ -4327,7 +4331,8 @@ sub scriptHandler { # and whether there are conflicting preceding scripts, which is an error # Parsing is too late! while (my $prev = pop(@LaTeXML::LIST)) { - if ($prev->getProperty('isSpace')) { + if (($prev->getProperty('isSpace')) + || (ref $prev eq 'LaTeXML::Core::Comment')) { $prevspace = 1; # a space avoids double-scripts unshift(@putback, $prev); # put back? assuming it will add rpadding to previous??? next; } @@ -4772,8 +4777,9 @@ DefMathLigature(matcher => sub { my ($document, $node) = @_; && ((($node->getAttribute('role') || 'UNKNOWN') eq 'UNKNOWN') || (($node->getAttribute('role') || 'UNKNOWN') eq 'NUMBER')) && (($s = $node->textContent . $s) =~ /^[0-9a-zA-Z]+$/)) { - $n++; $string = $s; - $node = $node->previousSibling; } + $string = $s; + do { $node = $node->previousSibling; $n++; + } while $node && ($node->nodeType == XML_COMMENT_NODE); } (($string =~ /^[a-zA-Z]/) && ($n > 1) ? ($n, $string, role => 'UNKNOWN', meaning => undef) : undef); } }); @@ -4831,8 +4837,6 @@ DefMathLigature(matcher => sub { my ($document, $node) = @_; last; } } # OR if XMHint with 0 <= width <= thickmuskip (5mu == ?) elsif ($qn eq 'ltx:XMHint') { -## if (($w = $node->getAttribute('width')) && ($w=Dimension($w)->valueOf) && ($w >= 0) && ($w <= $skip)) { -## $string = $text . $string; } # Add to string, but omit from number my $s; if (($s = $node->getAttribute('name')) && ($s = $space_chars{$s})) { $string = $s . $string; } @@ -4840,7 +4844,8 @@ DefMathLigature(matcher => sub { my ($document, $node) = @_; last; } } else { last; } - $n++; $node = $node->previousSibling; } + do { $node = $node->previousSibling; $n++; + } while $node && ($node->nodeType == XML_COMMENT_NODE); } if (($n > 1) && ($number =~ /\d/)) { ($n, $string, meaning => $number, role => 'NUMBER'); } }); @@ -4922,13 +4927,13 @@ DefPrimitive('\wlog{}', sub { return; }, locked => 1); # From plain.tex -DefPrimitive('\newcount Token', sub { +DefPrimitive('\newcount DefToken', sub { DefRegisterI($_[1], undef, Number(0), allocate => '\count'); }); -DefPrimitive('\newdimen Token', sub { +DefPrimitive('\newdimen DefToken', sub { DefRegisterI($_[1], undef, Dimension(0), allocate => '\dimen'); }); -DefPrimitive('\newskip Token', sub { +DefPrimitive('\newskip DefToken', sub { DefRegisterI($_[1], undef, Glue(0), allocate => '\skip'); }); -DefPrimitive('\newmuskip Token', sub { +DefPrimitive('\newmuskip DefToken', sub { DefRegisterI($_[1], undef, MuGlue(0), allocate => '\muskip'); }); AssignValue(allocated_boxes => 0); DefPrimitive('\newbox DefToken', sub { @@ -4936,8 +4941,8 @@ DefPrimitive('\newbox DefToken', sub { AssignValue(allocated_boxes => $n + 1, 'global'); AssignValue("box$n", List()); DefRegisterI($_[1], undef, Number($n), readonly => 1); }); -DefPrimitive('\newhelp Token {}', sub { AssignValue(ToString($_[1]) => $_[2]); }); -DefPrimitive('\newtoks Token', sub { DefRegisterI($_[1], undef, Tokens()); }); +DefPrimitive('\newhelp DefToken {}', sub { AssignValue(ToString($_[1]) => $_[2]); }); +DefPrimitive('\newtoks DefToken', sub { DefRegisterI($_[1], undef, Tokens()); }); # the next 4 actually work by doing a \chardef instead of \countdef, etc. # which means they actually work quite differently DefPrimitive('\alloc@@ {}', sub { @@ -4947,10 +4952,10 @@ DefPrimitive('\alloc@@ {}', sub { $n = $n->valueOf if ref $n; AssignValue($c => $n + 1, 'global'); AssignRegister('\allocationnumber' => Number($n), 'global'); }); -DefMacro('\newread Token', '\alloc@@{read}\global\chardef#1=\allocationnumber'); -DefMacro('\newwrite Token', '\alloc@@{write}\global\chardef#1=\allocationnumber'); -DefMacro('\newfam Token', '\alloc@@{fam}\global\chardef#1=\allocationnumber'); -DefMacro('\newlanguage Token', '\alloc@@{language}\global\chardef#1=\allocationnumber'); +DefMacro('\newread DefToken', '\alloc@@{read}\global\chardef#1=\allocationnumber'); +DefMacro('\newwrite DefToken', '\alloc@@{write}\global\chardef#1=\allocationnumber'); +DefMacro('\newfam DefToken', '\alloc@@{fam}\global\chardef#1=\allocationnumber'); +DefMacro('\newlanguage DefToken', '\alloc@@{language}\global\chardef#1=\allocationnumber'); DefMacro('\e@alloc{}{}{}{}{}{}', '\global\advance#3\@ne diff --git a/t/tokenize/ligatures.pdf b/t/tokenize/ligatures.pdf index d8eb397e2..a09eb65aa 100644 Binary files a/t/tokenize/ligatures.pdf and b/t/tokenize/ligatures.pdf differ diff --git a/t/tokenize/ligatures.tex b/t/tokenize/ligatures.tex index 497f24140..14a3f8b11 100644 --- a/t/tokenize/ligatures.tex +++ b/t/tokenize/ligatures.tex @@ -7,6 +7,29 @@ \section{Text Ligatures} In interjection --- like this --- gets em-dash. A ``quote'' like this. + +\section{Ignore comments} +%foo +`%bar +`Hopefully Quoted%baz +'%qux +' + +A number +\ensuremath{%foo + 12345.%bar +%baz + 67890%qux +} +? + +An --- emdash, +%foo +-%bar +-%baz +-%qux +perhaps? + \section{Typewriter non-Ligatures} \texttt{LDots\ldots, versus dots ...} diff --git a/t/tokenize/ligatures.xml b/t/tokenize/ligatures.xml index 235dd6ee8..4bc13fae1 100644 --- a/t/tokenize/ligatures.xml +++ b/t/tokenize/ligatures.xml @@ -28,15 +28,39 @@ In interjection — like this — gets em-dash.

2 §2 - <tag close=" ">2</tag>Typewriter non-Ligatures + <tag close=" ">2</tag>Ignore comments -

LDots…, versus dots ...

+

“Hopefully Quoted”

+

A number + + + 12345.67890 + + +?

+
+ +

An — emdash, +—perhaps?

+
+ +
+ + 3 + 3 + §3 + + <tag close=" ">3</tag>Typewriter non-Ligatures + +

LDots…, versus dots ...

+
+

A range: 1--10 gets en-dash. In interjection --- like this --- gets em-dash.

- +

A ‘‘quote’’ like this.