Skip to content

Commit b8a5f4c

Browse files
authored
Fix performance issue caused by using repeated > characters inside <?xml (#170)
A `>` is treated as a string delimiter. In certain cases, if `>` is used in succession inside a processing instruction, read and match are repeated, which slows down the process. Therefore, the processing instruction terminator `?>` is now passed to `match` as `term:`, so that the source reads ahead to that terminator in advance.
1 parent 4ebf21f commit b8a5f4c

File tree

3 files changed

+16
-4
lines changed

3 files changed

+16
-4
lines changed

lib/rexml/parsers/baseparser.rb

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ class BaseParser
125125

126126
module Private
127127
INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
128+
INSTRUCTION_TERM = "?>"
128129
TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
129130
CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
130131
ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
@@ -639,7 +640,7 @@ def parse_id_invalid_details(accept_external_id:,
639640
end
640641

641642
def process_instruction(start_position)
642-
match_data = @source.match(Private::INSTRUCTION_END, true)
643+
match_data = @source.match(Private::INSTRUCTION_END, true, term: Private::INSTRUCTION_TERM)
643644
unless match_data
644645
message = "Invalid processing instruction node"
645646
@source.position = start_position

lib/rexml/source.rb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ def read_until(term)
117117
def ensure_buffer
118118
end
119119

120-
def match(pattern, cons=false)
120+
def match(pattern, cons=false, term: nil)
121121
if cons
122122
@scanner.scan(pattern).nil? ? nil : @scanner
123123
else
@@ -240,7 +240,7 @@ def ensure_buffer
240240
# Note: When specifying a string for 'pattern', it must not include '>' except in the following formats:
241241
# - ">"
242242
# - "XXX>" (X is any string excluding '>')
243-
def match( pattern, cons=false )
243+
def match( pattern, cons=false, term: nil )
244244
while true
245245
if cons
246246
md = @scanner.scan(pattern)
@@ -250,7 +250,7 @@ def match( pattern, cons=false )
250250
break if md
251251
return nil if pattern.is_a?(String)
252252
return nil if @source.nil?
253-
return nil unless read
253+
return nil unless read(term)
254254
end
255255

256256
md.nil? ? nil : @scanner

test/parse/test_processing_instruction.rb

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
require "test/unit"
2+
require "core_assertions"
3+
24
require "rexml/document"
35

46
module REXMLTests
57
class TestParseProcessinInstruction < Test::Unit::TestCase
8+
include Test::Unit::CoreAssertions
9+
610
def parse(xml)
711
REXML::Document.new(xml)
812
end
@@ -69,5 +73,12 @@ def test_after_root
6973

7074
assert_equal("abc", events[:processing_instruction])
7175
end
76+
77+
def test_gt_linear_performance
78+
seq = [10000, 50000, 100000, 150000, 200000]
79+
assert_linear_performance(seq, rehearsal: 10) do |n|
80+
REXML::Document.new('<?xml version="1.0" ' + ">" * n + ' ?>')
81+
end
82+
end
7283
end
7384
end

0 commit comments

Comments
 (0)