brucemiller · brucemiller · Jul 4, 2023 · Jul 4, 2023 · Jul 4, 2023 · Jul 4, 2023
diff --git a/lib/LaTeXML/Core/Document.pm b/lib/LaTeXML/Core/Document.pm
@@ -678,12 +678,19 @@ sub insertComment {
   my ($self, $text) = @_;
   chomp($text);
   $text =~ s/\-\-+/__/g;
-  $self->closeText_internal;    # Close any open text node.
   my $comment;
+  my $prev     = $$self{node}->lastChild;    # Last child of the current node (undef if it has none)
+  my $prevtype = $prev && $prev->nodeType;    # Its node type, or false when there is no last child
   if ($$self{node}->nodeType == XML_DOCUMENT_NODE) {
     push(@{ $$self{pending} }, $comment = $$self{document}->createComment(' ' . $text . ' ')); }
-  elsif (($comment = $$self{node}->lastChild) && ($comment->nodeType == XML_COMMENT_NODE)) {
+  elsif ($prevtype && ($prevtype == XML_COMMENT_NODE)) {    # Last child is already a comment:
+    $comment = $prev;                                        # extend it rather than adding another
     $comment->setData($comment->data . "\n     " . $text . ' '); }
+  elsif ($prevtype && ($prevtype == XML_TEXT_NODE)) {    # Put comment BEFORE text node
+    if (($comment = $prev->previousSibling) && ($comment->nodeType == XML_COMMENT_NODE)) {    # Is the text itself preceded by a comment?
+      $comment = $$self{node}->appendChild($$self{document}->createComment(' ' . $text . ' ')); }    # NOTE(review): appends a NEW comment AFTER the text and drops the one just found; merging via setData (as in the branch above) may have been intended -- confirm
+    else {
+      $comment = $$self{node}->insertBefore($$self{document}->createComment(' ' . $text . ' '), $prev); } }    # No comment precedes the text: insert the new one in front of it
   else {
     $comment = $$self{node}->appendChild($$self{document}->createComment(' ' . $text . ' ')); }
   return $comment; }

diff --git a/lib/LaTeXML/Core/Parameter.pm b/lib/LaTeXML/Core/Parameter.pm
@@ -134,7 +134,7 @@ sub digest {
           my ($igullet) = @_;
           $igullet->unread($value);
           my @tokens = ();
-          while (defined(my $token = $igullet->getPendingComment || $igullet->readXToken(1, 1))) {
+          while (defined(my $token = $igullet->getPendingComment || $igullet->readXToken(1))) {
             push(@tokens, $token); }
           $value = Tokens(@tokens);
           $value = $value->neutralize; }); } }

diff --git a/lib/LaTeXML/Core/Stomach.pm b/lib/LaTeXML/Core/Stomach.pm
@@ -96,7 +96,7 @@ sub digestNextBody {
   my $alignment = $STATE->lookupValue('Alignment');
   my @aug       = ();
 
-  while (defined($token = $$self{gullet}->getPendingComment || $$self{gullet}->readXToken(1, 1))) {
+  while (defined($token = $$self{gullet}->getPendingComment || $$self{gullet}->readXToken(1))) {
     if ($alignment && scalar(@LaTeXML::LIST) && (Equals($token, T_ALIGN) ||
         Equals($token, T_CS('\cr')) || Equals($token, T_CS('\hidden@cr')) ||
         Equals($token, T_CS('\hidden@crcr')))) {
@@ -132,7 +132,7 @@ sub digest {
       my $initdepth = scalar(@{ $$self{boxing} });
       local @LaTeXML::LIST = ();
       while (defined(my $token =
-            $$self{gullet}->getPendingComment || $$self{gullet}->readXToken(1, 1))) {
+            $$self{gullet}->getPendingComment || $$self{gullet}->readXToken(1))) {
         push(@LaTeXML::LIST, $self->invokeToken($token));
         last if $initdepth > scalar(@{ $$self{boxing} }); }    # if we've closed the initial mode.
       List(@LaTeXML::LIST, mode => ($ismath ? 'math' : 'text'));

diff --git a/lib/LaTeXML/Core/Tokens.pm b/lib/LaTeXML/Core/Tokens.pm
@@ -60,17 +60,21 @@ sub revert {
 # NOT for creating valid TeX (use revert or UnTeX for that!)
 sub toString {
   my ($self) = @_;
-  return join('', map { $_->toString } @$self); }
+  return join('', map { $_->toString } grep { $$_[1] != CC_COMMENT } @$self); }    # Comment tokens contribute nothing
 
 # Methods for overloaded ops.
+
+# Compare two Tokens lists, ignoring comments & markers; returns 1 if they match, 0 otherwise.
 sub equals {
   my ($a, $b) = @_;
   return 0 unless defined $b && (ref $a) eq (ref $b);
   my @a = @$a;
   my @b = @$b;
-  while (@a && @b && ($a[0]->equals($b[0]))) {
-    shift(@a); shift(@b); }
-  return !(@a || @b); }
+  while (@a || @b) {    # Loop until both lists are exhausted
+    if (@a && (($a[0]->[1] == CC_COMMENT) || ($a[0]->[1] == CC_MARKER))) { shift(@a); next; }    # skip in $a
+    if (@b && (($b[0]->[1] == CC_COMMENT) || ($b[0]->[1] == CC_MARKER))) { shift(@b); next; }    # skip in $b
+    return 0 unless @a && @b && shift(@a)->equals(shift(@b)); }    # Defined false (not an empty list) on mismatch
+  return 1; }
 
 sub stringify {
   my ($self) = @_;

diff --git a/lib/LaTeXML/Package/TeX.pool.ltxml b/lib/LaTeXML/Package/TeX.pool.ltxml
@@ -351,7 +351,9 @@ DefParameterType('DefToken', sub {
     my ($gullet) = @_;
     my $token = $gullet->readToken;
     while ($token && ($token->getCatcode == CC_BEGIN)) {
-      my @toks = grep { !$_->equals(T_SPACE) } $gullet->readBalanced->unlist;
+      my $cc;
+      my @toks = grep { ($cc = $$_[1]) && ($cc != CC_SPACE) && ($cc != CC_COMMENT); }
+        $gullet->readBalanced->unlist;
       $token = shift(@toks);
       $gullet->unread(@toks); }
     $token; },
@@ -3609,8 +3611,8 @@ sub pruneEmpty {
   my ($document, $node) = @_;
   # In some cases we could have e.g. a \noindent followed by a {table},
   # in which case we end up with an empty ltx:para which we can prune.
-  if (!scalar($node->childNodes)) {
-    my $prev = $node->previousSibling;
+  if (!scalar(element_nodes($node))) {
+    my $prev = element_prev($node);
     if (!$prev || ($document->getNodeQName($prev) ne 'ltx:para')) {    # If $node WAS the 1st child
       $document->addClass($node->parentNode, 'ltx_pruned_first'); }
     $node->unlinkNode; }
@@ -3899,7 +3901,9 @@ sub cleanup_Math {
             push(@texts, $space); } } }
       else {    # is XMText
         foreach my $child ($xmnode->childNodes) {
-          if ($child->nodeType != XML_ELEMENT_NODE) {    # Make sure we've got an element
+          my $t = $child->nodeType;
+          if    ($t == XML_COMMENT_NODE) { }
+          elsif ($t != XML_ELEMENT_NODE) {     # Make sure we've got an element
             push(@texts, ['ltx:text', { class => 'ltx_markedasmath' }, $child]); }
           else {
             $document->addClass($child, 'ltx_markedasmath');
@@ -4327,7 +4331,8 @@ sub scriptHandler {
     # and whether there are conflicting preceding scripts, which is an error
     # Parsing is too late!
     while (my $prev = pop(@LaTeXML::LIST)) {
-      if ($prev->getProperty('isSpace')) {
+      if (($prev->getProperty('isSpace'))
+        || (ref $prev eq 'LaTeXML::Core::Comment')) {
         $prevspace = 1;              # a space avoids double-scripts
         unshift(@putback, $prev);    # put back? assuming it will add rpadding to previous???
         next; }
@@ -4772,8 +4777,9 @@ DefMathLigature(matcher => sub { my ($document, $node) = @_;
         && ((($node->getAttribute('role') || 'UNKNOWN') eq 'UNKNOWN')
           || (($node->getAttribute('role') || 'UNKNOWN') eq 'NUMBER'))
         && (($s = $node->textContent . $s) =~ /^[0-9a-zA-Z]+$/)) {
-        $n++; $string = $s;
-        $node = $node->previousSibling; }
+        $string = $s;
+        do { $node = $node->previousSibling; $n++;
+        } while $node && ($node->nodeType == XML_COMMENT_NODE); }
       (($string =~ /^[a-zA-Z]/) && ($n > 1) ? ($n, $string, role => 'UNKNOWN', meaning => undef) : undef);
 } });
 
@@ -4831,16 +4837,15 @@ DefMathLigature(matcher => sub { my ($document, $node) = @_;
           last; } }
       # OR if XMHint with 0 <= width <= thickmuskip (5mu == ?)
       elsif ($qn eq 'ltx:XMHint') {
-##        if (($w = $node->getAttribute('width')) && ($w=Dimension($w)->valueOf) && ($w >= 0) && ($w <= $skip)) {
-##          $string = $text . $string; } # Add to string, but omit from number
         my $s;
         if (($s = $node->getAttribute('name')) && ($s = $space_chars{$s})) {
           $string = $s . $string; }
         else {
           last; } }
       else {
         last; }
-      $n++; $node = $node->previousSibling; }
+      do { $node = $node->previousSibling; $n++;
+      } while $node && ($node->nodeType == XML_COMMENT_NODE); }
     if (($n > 1) && ($number =~ /\d/)) {
       ($n, $string, meaning => $number, role => 'NUMBER'); } });
 
@@ -4922,22 +4927,22 @@ DefPrimitive('\wlog{}', sub {
     return; },
   locked => 1);
 # From plain.tex
-DefPrimitive('\newcount  Token', sub {
+DefPrimitive('\newcount  DefToken', sub {    # DefToken: the name may be wrapped in braces; spaces & comments inside are skipped
     DefRegisterI($_[1], undef, Number(0), allocate => '\count'); });
-DefPrimitive('\newdimen  Token', sub {
+DefPrimitive('\newdimen  DefToken', sub {    # same argument handling as \newcount
     DefRegisterI($_[1], undef, Dimension(0), allocate => '\dimen'); });
-DefPrimitive('\newskip   Token', sub {
+DefPrimitive('\newskip   DefToken', sub {    # same argument handling as \newcount
     DefRegisterI($_[1], undef, Glue(0), allocate => '\skip'); });
-DefPrimitive('\newmuskip Token', sub {
+DefPrimitive('\newmuskip DefToken', sub {    # same argument handling as \newcount
     DefRegisterI($_[1], undef, MuGlue(0), allocate => '\muskip'); });
 AssignValue(allocated_boxes => 0);
 DefPrimitive('\newbox DefToken', sub {
     my $n = LookupValue('allocated_boxes');
     AssignValue(allocated_boxes => $n + 1, 'global');
     AssignValue("box$n", List());
     DefRegisterI($_[1], undef, Number($n), readonly => 1); });
-DefPrimitive('\newhelp Token {}', sub { AssignValue(ToString($_[1]) => $_[2]); });
-DefPrimitive('\newtoks Token',    sub { DefRegisterI($_[1], undef, Tokens()); });
+DefPrimitive('\newhelp DefToken {}', sub { AssignValue(ToString($_[1]) => $_[2]); });    # stores the {} argument under the token's string form
+DefPrimitive('\newtoks DefToken',    sub { DefRegisterI($_[1], undef, Tokens()); });      # register starts as an empty Tokens list
 # the next 4 actually work by doing a \chardef instead of \countdef, etc.
 # which means they actually work quite differently
 DefPrimitive('\alloc@@ {}', sub {
@@ -4947,10 +4952,10 @@ DefPrimitive('\alloc@@ {}', sub {
     $n = $n->valueOf if ref $n;
     AssignValue($c => $n + 1, 'global');
     AssignRegister('\allocationnumber' => Number($n), 'global'); });
-DefMacro('\newread Token',     '\alloc@@{read}\global\chardef#1=\allocationnumber');
-DefMacro('\newwrite Token',    '\alloc@@{write}\global\chardef#1=\allocationnumber');
-DefMacro('\newfam Token',      '\alloc@@{fam}\global\chardef#1=\allocationnumber');
-DefMacro('\newlanguage Token', '\alloc@@{language}\global\chardef#1=\allocationnumber');
+DefMacro('\newread DefToken',     '\alloc@@{read}\global\chardef#1=\allocationnumber');        # #1 is read as a DefToken, then \chardef'd to \allocationnumber
+DefMacro('\newwrite DefToken',    '\alloc@@{write}\global\chardef#1=\allocationnumber');       # same pattern, 'write' allocation
+DefMacro('\newfam DefToken',      '\alloc@@{fam}\global\chardef#1=\allocationnumber');         # same pattern, 'fam' allocation
+DefMacro('\newlanguage DefToken', '\alloc@@{language}\global\chardef#1=\allocationnumber');    # same pattern, 'language' allocation
 
 DefMacro('\e@alloc{}{}{}{}{}{}',
   '\global\advance#3\@ne

diff --git a/t/tokenize/ligatures.pdf b/t/tokenize/ligatures.pdf
diff --git a/t/tokenize/ligatures.tex b/t/tokenize/ligatures.tex
@@ -7,6 +7,29 @@ \section{Text Ligatures}
 In interjection --- like this --- gets em-dash.
 
 A ``quote'' like this.
+
+\section{Ignore comments}
+%foo
+`%bar
+`Hopefully Quoted%baz
+'%qux
+'
+
+A number
+\ensuremath{%foo
+  12345.%bar
+%baz
+  67890%qux
+}
+?
+
+An --- emdash, 
+%foo
+-%bar
+-%baz
+-%qux
+perhaps?
+
 \section{Typewriter non-Ligatures}
 \texttt{LDots\ldots, versus dots ...}
 

diff --git a/t/tokenize/ligatures.xml b/t/tokenize/ligatures.xml
@@ -28,15 +28,39 @@ In interjection — like this — gets em-dash.</p>
       <tag role="refnum">2</tag>
       <tag role="typerefnum">§2</tag>
     </tags>
-    <title><tag close=" ">2</tag>Typewriter non-Ligatures</title>
+    <title><tag close=" ">2</tag>Ignore comments</title>
     <para xml:id="S2.p1">
-      <p><text font="typewriter">LDots…, versus dots ...</text></p>
+      <p>“Hopefully Quoted”</p>
     </para>
     <para xml:id="S2.p2">
+      <p>A number
+<Math mode="inline" tex="12345.67890" text="12345.67890" xml:id="S2.p2.m1">
+          <XMath>
+            <XMTok meaning="12345.67890" role="NUMBER">12345.67890</XMTok>
+          </XMath>
+        </Math>
+?</p>
+    </para>
+    <para xml:id="S2.p3">
+      <p>An — emdash,
+—perhaps?</p>
+    </para>
+  </section>
+  <section inlist="toc" xml:id="S3">
+    <tags>
+      <tag>3</tag>
+      <tag role="refnum">3</tag>
+      <tag role="typerefnum">§3</tag>
+    </tags>
+    <title><tag close=" ">3</tag>Typewriter non-Ligatures</title>
+    <para xml:id="S3.p1">
+      <p><text font="typewriter">LDots…, versus dots ...</text></p>
+    </para>
+    <para xml:id="S3.p2">
       <p><text font="typewriter">A range: 1--10 gets en-dash.</text>
 <text font="typewriter">In interjection --- like this --- gets em-dash.</text></p>
     </para>
-    <para xml:id="S2.p3">
+    <para xml:id="S3.p3">
       <p><text font="typewriter">A ‘‘quote’’ like this.</text></p>
     </para>
   </section>