Skip to content

Commit

Permalink
fix: Extra content at the end of the document (#161)
Browse files Browse the repository at this point in the history
## Why?

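An XML document is not well-formed if anything other than comments, processing instructions, or whitespace follows the root element, so such trailing content must be rejected.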

https://www.w3.org/TR/2006/REC-xml11-20060816/#document

```
[1]   document   ::=   ( prolog element Misc* ) - ( Char* RestrictedChar Char* )
```

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Misc

```
[27]    Misc       ::=          Comment | PI | S
```

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PI

```
[16]    PI         ::=          '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
```

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget

```
[17]    PITarget           ::=          Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
```
  • Loading branch information
naitoh authored Jul 7, 2024
1 parent face9dd commit eb45c8d
Show file tree
Hide file tree
Showing 6 changed files with 99 additions and 7 deletions.
9 changes: 9 additions & 0 deletions lib/rexml/parsers/baseparser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -460,15 +460,24 @@ def pull_event
@closed = tag
@nsstack.shift
else
if @tags.empty? and @have_root
raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
end
@tags.push( tag )
end
@have_root = true
return [ :start_element, tag, attributes ]
end
else
text = @source.read_until("<")
if text.chomp!("<")
@source.position -= "<".bytesize
end
if @tags.empty? and @have_root
unless /\A\s*\z/.match?(text)
raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
end
end
return [ :text, text ]
end
rescue REXML::UndefinedNamespaceException
Expand Down
12 changes: 12 additions & 0 deletions test/parse/test_comment.rb
Original file line number Diff line number Diff line change
Expand Up @@ -105,5 +105,17 @@ def test_after_doctype_malformed_comment_end
DETAIL
end
end

# A comment placed after the root element's end tag is legal trailing content,
# so the parser must emit it as a :comment event instead of raising.
def test_after_root
  xml = '<a></a><!-- ok comment -->'
  parser = REXML::Parsers::BaseParser.new(xml)

  # Keep the second field of the most recent event of each type.
  payload_by_type = {}
  while parser.has_next?
    type, payload = parser.pull
    payload_by_type[type] = payload
  end

  assert_equal(" ok comment ", payload_by_type[:comment])
end
end
end
34 changes: 34 additions & 0 deletions test/parse/test_element.rb
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,40 @@ def test_garbage_less_than_slash_before_end_tag_at_line_start
</ </x>
DETAIL
end

# A second start tag after the root element has been closed must be rejected
# with a ParseException whose message names the offending tag.
def test_after_root
  expected_message = <<~DETAIL.chomp
    Malformed XML: Extra tag at the end of the document (got '<b')
    Line: 1
    Position: 10
    Last 80 unconsumed characters:
  DETAIL

  error = assert_raise(REXML::ParseException) do
    parser = REXML::Parsers::BaseParser.new('<a></a><b>')
    # Drain every event; the exception is expected while pulling.
    parser.pull while parser.has_next?
  end

  assert_equal(expected_message, error.to_s)
end

# Same check as for a normally closed root, but the root is an empty-element
# tag (<a/>): a following start tag must still raise a ParseException.
def test_after_empty_element_tag_root
  expected_message = <<~DETAIL.chomp
    Malformed XML: Extra tag at the end of the document (got '<b')
    Line: 1
    Position: 7
    Last 80 unconsumed characters:
  DETAIL

  error = assert_raise(REXML::ParseException) do
    parser = REXML::Parsers::BaseParser.new('<a/><b>')
    # Drain every event; the exception is expected while pulling.
    parser.pull while parser.has_next?
  end

  assert_equal(expected_message, error.to_s)
end
end
end
end
12 changes: 12 additions & 0 deletions test/parse/test_processing_instruction.rb
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,17 @@ def test_garbage_text
])
end
end

# A processing instruction after the root element is permitted; the
# :processing_instruction event must carry its target ("abc").
def test_after_root
  xml = '<a></a><?abc version="1.0" ?>'
  parser = REXML::Parsers::BaseParser.new(xml)

  # Keep the second field of the most recent event of each type.
  payload_by_type = {}
  while parser.has_next?
    type, payload = parser.pull
    payload_by_type[type] = payload
  end

  assert_equal("abc", payload_by_type[:processing_instruction])
end
end
end
25 changes: 25 additions & 0 deletions test/parse/test_text.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
require "test/unit"
require 'rexml/parsers/baseparser'

module REXMLTests
  # Parser-level tests for character data (text) handling.
  class TestParseText < Test::Unit::TestCase
    # Inputs the parser is expected to reject.
    class TestInvalid < self
      # Non-whitespace text after the root element's end tag must raise a
      # ParseException that quotes the offending text.
      def test_after_root
        expected_message = <<~DETAIL.chomp
          Malformed XML: Extra content at the end of the document (got 'c')
          Line: 1
          Position: 8
          Last 80 unconsumed characters:
        DETAIL

        error = assert_raise(REXML::ParseException) do
          parser = REXML::Parsers::BaseParser.new('<a></a>c')
          # Drain every event; the exception is expected while pulling.
          parser.pull while parser.has_next?
        end

        assert_equal(expected_message, error.to_s)
      end
    end
  end
end
14 changes: 7 additions & 7 deletions test/test_pullparser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -63,23 +63,23 @@ def test_entity_replacement
end

# Verifies that decimal (&#65;) and hexadecimal (&#x42;) character references
# are expanded in the text reported by the pull parser.
#
# The two sample elements are wrapped in a single <root> element because a
# document may have only one root element; the parser raises a ParseException
# for a second top-level start tag.
def test_character_references
  source = '<root><a>&#65;</a><b>&#x42;</b></root>'
  parser = REXML::Parsers::PullParser.new( source )

  # Text seen so far, keyed by the name of the most recently opened element.
  events = {}
  element_name = ''
  while parser.has_next?
    event = parser.pull
    case event.event_type
    when :start_element
      element_name = event[0]
    when :text
      events[element_name] = event[1]
    end
  end

  # Asserting after the loop (rather than inside it) makes the test fail when
  # an expected text event is never emitted.
  assert_equal('A', events['a'])
  assert_equal("B", events['b'])
end

def test_text_content_with_line_breaks
Expand Down

0 comments on commit eb45c8d

Please sign in to comment.