Skip to content

Commit 84266c6

Browse files
committed
Properly normalize attribute values
closes tafia#371
1 parent 0febc2b commit 84266c6

File tree

6 files changed

+254
-98
lines changed

6 files changed

+254
-98
lines changed

benches/macrobenches.rs

+1-2
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,13 @@ static SAMPLE_NS: &[u8] = include_bytes!("../tests/documents/sample_ns.xml");
1717
static PLAYERS: &[u8] = include_bytes!("../tests/documents/players.xml");
1818

1919
// TODO: read the namespaces too
20-
// TODO: use fully normalized attribute values
2120
fn parse_document(doc: &[u8]) -> XmlResult<()> {
2221
let mut r = Reader::from_reader(doc);
2322
loop {
2423
match r.read_event_unbuffered()? {
2524
Event::Start(e) | Event::Empty(e) => {
2625
for attr in e.attributes() {
27-
criterion::black_box(attr?.unescaped_value()?);
26+
criterion::black_box(attr?.normalized_value()?);
2827
}
2928
}
3029
Event::Text(e) => {

benches/microbenches.rs

+125-80
Original file line numberDiff line numberDiff line change
@@ -125,86 +125,6 @@ fn read_namespaced_event(c: &mut Criterion) {
125125
group.finish();
126126
}
127127

128-
/// Benchmarks the `BytesText::unescaped()` method (includes time of `read_event`
129-
/// benchmark)
130-
fn bytes_text_unescaped(c: &mut Criterion) {
131-
let mut group = c.benchmark_group("BytesText::unescaped");
132-
group.bench_function("trim_text = false", |b| {
133-
b.iter(|| {
134-
let mut buf = Vec::new();
135-
let mut r = Reader::from_reader(SAMPLE);
136-
r.check_end_names(false).check_comments(false);
137-
let mut count = criterion::black_box(0);
138-
let mut nbtxt = criterion::black_box(0);
139-
loop {
140-
match r.read_event(&mut buf) {
141-
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
142-
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
143-
Ok(Event::Eof) => break,
144-
_ => (),
145-
}
146-
buf.clear();
147-
}
148-
assert_eq!(
149-
count, 1550,
150-
"Overall tag count in ./tests/documents/sample_rss.xml"
151-
);
152-
153-
// Windows has \r\n instead of \n
154-
#[cfg(windows)]
155-
assert_eq!(
156-
nbtxt, 67661,
157-
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
158-
);
159-
160-
#[cfg(not(windows))]
161-
assert_eq!(
162-
nbtxt, 66277,
163-
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
164-
);
165-
});
166-
});
167-
168-
group.bench_function("trim_text = true", |b| {
169-
b.iter(|| {
170-
let mut buf = Vec::new();
171-
let mut r = Reader::from_reader(SAMPLE);
172-
r.check_end_names(false)
173-
.check_comments(false)
174-
.trim_text(true);
175-
let mut count = criterion::black_box(0);
176-
let mut nbtxt = criterion::black_box(0);
177-
loop {
178-
match r.read_event(&mut buf) {
179-
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
180-
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
181-
Ok(Event::Eof) => break,
182-
_ => (),
183-
}
184-
buf.clear();
185-
}
186-
assert_eq!(
187-
count, 1550,
188-
"Overall tag count in ./tests/documents/sample_rss.xml"
189-
);
190-
191-
// Windows has \r\n instead of \n
192-
#[cfg(windows)]
193-
assert_eq!(
194-
nbtxt, 50334,
195-
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
196-
);
197-
198-
#[cfg(not(windows))]
199-
assert_eq!(
200-
nbtxt, 50261,
201-
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
202-
);
203-
});
204-
});
205-
group.finish();
206-
}
207-
208128
/// Benchmarks, how fast individual event parsed
209129
fn one_event(c: &mut Criterion) {
210130
let mut group = c.benchmark_group("One event");
@@ -364,6 +284,130 @@ fn attributes(c: &mut Criterion) {
364284
assert_eq!(count, 150);
365285
})
366286
});
287+
288+
group.finish();
289+
}
290+
291+
/// Benchmarks normalizing attribute values
292+
fn attribute_value_normalization(c: &mut Criterion) {
293+
let mut group = c.benchmark_group("attribute_value_normalization");
294+
295+
group.bench_function("noop_short", |b| {
296+
b.iter(|| {
297+
criterion::black_box(unescape(b"foobar")).unwrap();
298+
})
299+
});
300+
301+
group.bench_function("noop_long", |b| {
302+
b.iter(|| {
303+
criterion::black_box(unescape(b"just a bit of text without any entities")).unwrap();
304+
})
305+
});
306+
307+
group.bench_function("replacement_chars", |b| {
308+
b.iter(|| {
309+
criterion::black_box(unescape(b"just a bit\n of text without\tany entities")).unwrap();
310+
})
311+
});
312+
313+
group.bench_function("char_reference", |b| {
314+
b.iter(|| {
315+
let text = b"prefix &#34;some stuff&#34;,&#x22;more stuff&#x22;";
316+
criterion::black_box(unescape(text)).unwrap();
317+
let text = b"&#38;&#60;";
318+
criterion::black_box(unescape(text)).unwrap();
319+
})
320+
});
321+
322+
group.bench_function("entity_reference", |b| {
323+
b.iter(|| {
324+
let text = b"age &gt; 72 &amp;&amp; age &lt; 21";
325+
criterion::black_box(unescape(text)).unwrap();
326+
let text = b"&quot;what&apos;s that?&quot;";
327+
criterion::black_box(unescape(text)).unwrap();
328+
})
329+
});
330+
331+
group.finish();
332+
}
333+
334+
/// Benchmarks the `BytesText::unescaped()` method (includes time of `read_event`
335+
/// benchmark)
336+
fn bytes_text_unescaped(c: &mut Criterion) {
337+
let mut group = c.benchmark_group("BytesText::unescaped");
338+
group.bench_function("trim_text = false", |b| {
339+
b.iter(|| {
340+
let mut buf = Vec::new();
341+
let mut r = Reader::from_reader(SAMPLE);
342+
r.check_end_names(false).check_comments(false);
343+
let mut count = criterion::black_box(0);
344+
let mut nbtxt = criterion::black_box(0);
345+
loop {
346+
match r.read_event(&mut buf) {
347+
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
348+
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
349+
Ok(Event::Eof) => break,
350+
_ => (),
351+
}
352+
buf.clear();
353+
}
354+
assert_eq!(
355+
count, 1550,
356+
"Overall tag count in ./tests/documents/sample_rss.xml"
357+
);
358+
359+
// Windows has \r\n instead of \n
360+
#[cfg(windows)]
361+
assert_eq!(
362+
nbtxt, 67661,
363+
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
364+
);
365+
366+
#[cfg(not(windows))]
367+
assert_eq!(
368+
nbtxt, 66277,
369+
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
370+
);
371+
});
372+
});
373+
374+
group.bench_function("trim_text = true", |b| {
375+
b.iter(|| {
376+
let mut buf = Vec::new();
377+
let mut r = Reader::from_reader(SAMPLE);
378+
r.check_end_names(false)
379+
.check_comments(false)
380+
.trim_text(true);
381+
let mut count = criterion::black_box(0);
382+
let mut nbtxt = criterion::black_box(0);
383+
loop {
384+
match r.read_event(&mut buf) {
385+
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
386+
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
387+
Ok(Event::Eof) => break,
388+
_ => (),
389+
}
390+
buf.clear();
391+
}
392+
assert_eq!(
393+
count, 1550,
394+
"Overall tag count in ./tests/documents/sample_rss.xml"
395+
);
396+
397+
// Windows has \r\n instead of \n
398+
#[cfg(windows)]
399+
assert_eq!(
400+
nbtxt, 50334,
401+
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
402+
);
403+
404+
#[cfg(not(windows))]
405+
assert_eq!(
406+
nbtxt, 50261,
407+
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
408+
);
409+
});
410+
});
367411
group.finish();
368412
}
369413

@@ -477,6 +521,7 @@ criterion_group!(
477521
read_namespaced_event,
478522
one_event,
479523
attributes,
524+
attribute_value_normalization,
480525
escaping,
481526
unescaping,
482527
);

src/errors.rs

+1
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ impl From<EscapeError> for Error {
7272
}
7373

7474
impl From<AttrError> for Error {
75+
/// Creates a new `Error::InvalidAttr` from the given error
7576
#[inline]
7677
fn from(error: AttrError) -> Self {
7778
Error::InvalidAttr(error)

src/escapei.rs

+8-8
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ use std::ops::Range;
99
use pretty_assertions::assert_eq;
1010

1111
/// Error for XML escape/unescqpe.
12-
#[derive(Debug)]
12+
#[derive(Debug, PartialEq)]
1313
pub enum EscapeError {
1414
/// Entity with Null character
1515
EntityWithNull(::std::ops::Range<usize>),
@@ -134,7 +134,7 @@ pub fn unescape(raw: &[u8]) -> Result<Cow<[u8]>, EscapeError> {
134134
}
135135

136136
/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
137-
/// value, using a dictionnary of custom entities.
137+
/// value, using a dictionary of custom entities.
138138
///
139139
/// # Pre-condition
140140
///
@@ -201,7 +201,7 @@ pub fn do_unescape<'a>(
201201
}
202202

203203
#[cfg(not(feature = "escape-html"))]
204-
const fn named_entity(name: &[u8]) -> Option<&str> {
204+
pub(crate) const fn named_entity(name: &[u8]) -> Option<&str> {
205205
let s = match name {
206206
b"lt" => "<",
207207
b"gt" => ">",
@@ -213,7 +213,7 @@ const fn named_entity(name: &[u8]) -> Option<&str> {
213213
Some(s)
214214
}
215215
#[cfg(feature = "escape-html")]
216-
const fn named_entity(name: &[u8]) -> Option<&str> {
216+
pub(crate) const fn named_entity(name: &[u8]) -> Option<&str> {
217217
// imported from https://dev.w3.org/html5/html-author/charref
218218
let s = match name {
219219
b"Tab" => "\u{09}",
@@ -1675,12 +1675,12 @@ const fn named_entity(name: &[u8]) -> Option<&str> {
16751675
Some(s)
16761676
}
16771677

1678-
fn push_utf8(out: &mut Vec<u8>, code: char) {
1678+
pub(crate) fn push_utf8(out: &mut Vec<u8>, code: char) {
16791679
let mut buf = [0u8; 4];
16801680
out.extend_from_slice(code.encode_utf8(&mut buf).as_bytes());
16811681
}
16821682

1683-
fn parse_number(bytes: &[u8], range: Range<usize>) -> Result<char, EscapeError> {
1683+
pub(crate) fn parse_number(bytes: &[u8], range: Range<usize>) -> Result<char, EscapeError> {
16841684
let code = if bytes.starts_with(b"x") {
16851685
parse_hexadecimal(&bytes[1..])
16861686
} else {
@@ -1695,7 +1695,7 @@ fn parse_number(bytes: &[u8], range: Range<usize>) -> Result<char, EscapeError>
16951695
}
16961696
}
16971697

1698-
fn parse_hexadecimal(bytes: &[u8]) -> Result<u32, EscapeError> {
1698+
pub(crate) fn parse_hexadecimal(bytes: &[u8]) -> Result<u32, EscapeError> {
16991699
// maximum code is 0x10FFFF => 6 characters
17001700
if bytes.len() > 6 {
17011701
return Err(EscapeError::TooLongHexadecimal);
@@ -1713,7 +1713,7 @@ fn parse_hexadecimal(bytes: &[u8]) -> Result<u32, EscapeError> {
17131713
Ok(code)
17141714
}
17151715

1716-
fn parse_decimal(bytes: &[u8]) -> Result<u32, EscapeError> {
1716+
pub(crate) fn parse_decimal(bytes: &[u8]) -> Result<u32, EscapeError> {
17171717
// maximum code is 0x10FFFF = 1114111 => 7 characters
17181718
if bytes.len() > 7 {
17191719
return Err(EscapeError::TooLongDecimal);

0 commit comments

Comments
 (0)