From 9f1415a2616c77cad44a176eee90e8457b4774b6 Mon Sep 17 00:00:00 2001 From: Watson Date: Tue, 16 Jul 2024 11:04:40 +0900 Subject: [PATCH] Fix performance issue caused by using repeated `>` characters inside `CDATA [ PAYLOAD ]` (#172) A `>` is treated as a string delimiter. In certain cases, if `>` is used in succession, read and match are repeated, which slows down the process. Therefore, `term: Private::CDATA_TERM` is passed to `@source.match` to read ahead to the `]]>` terminator in advance. --- lib/rexml/parsers/baseparser.rb | 3 ++- test/parse/test_cdata.rb | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 test/parse/test_cdata.rb diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index ba205175..e2c0fd80 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -127,6 +127,7 @@ module Private INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um INSTRUCTION_TERM = "?>" COMMENT_TERM = "-->" + CDATA_TERM = "]]>" TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um @@ -431,7 +432,7 @@ def pull_event return [ :comment, md[1] ] else - md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true) + md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true, term: Private::CDATA_TERM) return [ :cdata, md[1] ] if md end raise REXML::ParseException.new( "Declarations can only occur "+ diff --git a/test/parse/test_cdata.rb b/test/parse/test_cdata.rb new file mode 100644 index 00000000..9e8fa8b2 --- /dev/null +++ b/test/parse/test_cdata.rb @@ -0,0 +1,17 @@ +require "test/unit" +require "core_assertions" + +require "rexml/document" + +module REXMLTests + class TestParseCData < Test::Unit::TestCase + include Test::Unit::CoreAssertions + + def test_gt_linear_performance + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('<description><![CDATA[ ' + ">" * n + ' ]]></description>') + end + end + end +end