Skip to content

Commit 4ebf21f

Browse files
authored
Fix a bug where the SAX2 parser doesn't expand predefined entities for "characters" (#168)
## Why? The SAX2 parser expands user-defined entity references and character references but doesn't expand predefined entity references. ## Change - text_unnormalized.rb ``` require 'rexml/document' require 'rexml/parsers/sax2parser' require 'rexml/parsers/pullparser' require 'rexml/parsers/streamparser' xml = <<EOS <root> <A>&lt;P&gt;&#13; &lt;I&gt; &lt;B&gt; Text &lt;/B&gt; &lt;/I&gt;</A> </root> EOS class Listener def method_missing(name, *args) p [name, *args] end end puts "REXML(DOM)" REXML::Document.new(xml).elements.each("/root/A") {|element| puts element.text} puts "" puts "REXML(Pull)" parser = REXML::Parsers::PullParser.new(xml) while parser.has_next? res = parser.pull p res end puts "" puts "REXML(Stream)" parser = REXML::Parsers::StreamParser.new(xml, Listener.new).parse puts "" puts "REXML(SAX)" parser = REXML::Parsers::SAX2Parser.new(xml) parser.listen(Listener.new) parser.parse ``` ## Before (master) ``` $ ruby text_unnormalized.rb REXML(DOM) <I> <B> Text </B> </I> REXML(Pull) start_element: ["root", {}] text: ["\n ", "\n "] start_element: ["A", {}] text: ["&lt;P&gt;&#13; &lt;I&gt; &lt;B&gt; Text &lt;/B&gt; &lt;/I&gt;", "<P>\r <I> <B> Text </B> </I>"] end_element: ["A"] text: ["\n", "\n"] end_element: ["root"] end_document: [] REXML(Stream) [:tag_start, "root", {}] [:text, "\n "] [:tag_start, "A", {}] [:text, "<P>\r <I> <B> Text </B> </I>"] [:tag_end, "A"] [:text, "\n"] [:tag_end, "root"] REXML(SAX) [:start_document] [:start_element, nil, "root", "root", {}] [:progress, 6] [:characters, "\n "] [:progress, 9] [:start_element, nil, "A", "A", {}] [:progress, 12] [:characters, "&lt;P&gt;\r &lt;I&gt; &lt;B&gt; Text &lt;/B&gt; &lt;/I&gt;"] #<= This [:progress, 74] [:end_element, nil, "A", "A"] [:progress, 78] [:characters, "\n"] [:progress, 79] [:end_element, nil, "root", "root"] [:progress, 86] [:end_document] ``` ## After (This PR) ``` $ ruby text_unnormalized.rb REXML(SAX) [:start_document] [:start_element, nil, "root", "root", {}] [:progress, 6] 
[:characters, "\n "] [:progress, 9] [:start_element, nil, "A", "A", {}] [:progress, 12] [:characters, "<P>\r <I> <B> Text </B> </I>"] [:progress, 74] [:end_element, nil, "A", "A"] [:progress, 78] [:characters, "\n"] [:progress, 79] [:end_element, nil, "root", "root"] [:progress, 86] [:end_document] ```
1 parent a5075c1 commit 4ebf21f

File tree

4 files changed

+31
-21
lines changed

4 files changed

+31
-21
lines changed

lib/rexml/parsers/sax2parser.rb

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -157,25 +157,8 @@ def parse
157157
end
158158
end
159159
when :text
160-
#normalized = @parser.normalize( event[1] )
161-
#handle( :characters, normalized )
162-
copy = event[1].clone
163-
164-
esub = proc { |match|
165-
if @entities.has_key?($1)
166-
@entities[$1].gsub(Text::REFERENCE, &esub)
167-
else
168-
match
169-
end
170-
}
171-
172-
copy.gsub!( Text::REFERENCE, &esub )
173-
copy.gsub!( Text::NUMERICENTITY ) {|m|
174-
m=$1
175-
m = "0#{m}" if m[0] == ?x
176-
[Integer(m)].pack('U*')
177-
}
178-
handle( :characters, copy )
160+
unnormalized = @parser.unnormalize( event[1], @entities )
161+
handle( :characters, unnormalized )
179162
when :entitydecl
180163
handle_entitydecl( event )
181164
when :processing_instruction, :comment, :attlistdecl,

lib/rexml/parsers/streamparser.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@ def parse
3636
@listener.tag_end( event[1] )
3737
@tag_stack.pop
3838
when :text
39-
normalized = @parser.unnormalize( event[1] )
40-
@listener.text( normalized )
39+
unnormalized = @parser.unnormalize( event[1] )
40+
@listener.text( unnormalized )
4141
when :processing_instruction
4242
@listener.instruction( *event[1,2] )
4343
when :start_doctype

test/test_pullparser.rb

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,22 @@ def test_character_references
8282
assert_equal("B", events['b'])
8383
end
8484

85+
def test_text_entity_references
86+
source = '<root><a>&lt;P&gt; &lt;I&gt; &lt;B&gt; Text &lt;/B&gt; &lt;/I&gt;</a></root>'
87+
parser = REXML::Parsers::PullParser.new( source )
88+
89+
events = []
90+
while parser.has_next?
91+
event = parser.pull
92+
case event.event_type
93+
when :text
94+
events << event[1]
95+
end
96+
end
97+
98+
assert_equal(["<P> <I> <B> Text </B> </I>"], events)
99+
end
100+
85101
def test_text_content_with_line_breaks
86102
source = "<root><a>A</a><b>B\n</b><c>C\r\n</c></root>"
87103
parser = REXML::Parsers::PullParser.new( source )

test/test_sax.rb

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,17 @@ def test_entity_replacement
3131
assert_equal '--1234--', results[1]
3232
end
3333

34+
def test_characters_predefined_entities
35+
source = '<root><a>&lt;P&gt; &lt;I&gt; &lt;B&gt; Text &lt;/B&gt; &lt;/I&gt;</a></root>'
36+
37+
sax = Parsers::SAX2Parser.new( source )
38+
results = []
39+
sax.listen(:characters) {|x| results << x }
40+
sax.parse
41+
42+
assert_equal(["<P> <I> <B> Text </B> </I>"], results)
43+
end
44+
3445
def test_sax2
3546
File.open(fixture_path("documentation.xml")) do |f|
3647
parser = Parsers::SAX2Parser.new( f )

0 commit comments

Comments
 (0)