Skip to content

Commit 561c11b

Browse files
committed
Properly normalize attribute values
closes tafia#371
1 parent a54f6c2 commit 561c11b

File tree

7 files changed

+273
-11
lines changed

7 files changed

+273
-11
lines changed

Changelog.md

+10
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212

1313
### New Features
1414

15+
- [#379]: Support for the full XML attribute value normalization process
16+
1517
### Bug Fixes
1618

1719
- [#490]: Ensure that serialization of map keys always produces valid XML names.
@@ -65,6 +67,11 @@
6567
- [#489]: Reduced the size of the package uploaded into the crates.io by excluding
6668
tests, examples, and benchmarks.
6769

70+
### New Tests
71+
72+
- [#379]: Added tests for attribute value normalization
73+
74+
[#379]: https://github.com/tafia/quick-xml/pull/379
6875
[#481]: https://github.com/tafia/quick-xml/pull/481
6976
[#489]: https://github.com/tafia/quick-xml/pull/489
7077

@@ -105,6 +112,9 @@
105112
- [#395]: Add support for XML Schema `xs:list`
106113
- [#324]: `Reader::from_str` / `Deserializer::from_str` / `from_str` now ignore
107114
the XML declared encoding and always use UTF-8
115+
- [#379]: Add full support for XML attribute value normalization via
116+
`Attribute::normalized_value()` and
117+
`Attribute::normalized_value_with_custom_entities()`
108118
- [#416]: Add `borrow()` methods in all event structs which allows to get
109119
a borrowed version of any event
110120
- [#437]: Split out namespace reading functionality to a dedicated `NsReader`, namely:

benches/macrobenches.rs

+1-2
Original file line numberDiff line numberDiff line change
@@ -43,14 +43,13 @@ static INPUTS: &[(&str, &str)] = &[
4343
("players.xml", PLAYERS),
4444
];
4545

46-
// TODO: use fully normalized attribute values
4746
fn parse_document_from_str(doc: &str) -> XmlResult<()> {
4847
let mut r = Reader::from_str(doc);
4948
loop {
5049
match criterion::black_box(r.read_event()?) {
5150
Event::Start(e) | Event::Empty(e) => {
5251
for attr in e.attributes() {
53-
criterion::black_box(attr?.decode_and_unescape_value(&r)?);
52+
criterion::black_box(attr?.normalized_value()?);
5453
}
5554
}
5655
Event::Text(e) => {

benches/microbenches.rs

+125
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,130 @@ fn attributes(c: &mut Criterion) {
242242
assert_eq!(count, 150);
243243
})
244244
});
245+
246+
group.finish();
247+
}
248+
249+
/// Benchmarks normalizing attribute values
250+
fn attribute_value_normalization(c: &mut Criterion) {
251+
let mut group = c.benchmark_group("attribute_value_normalization");
252+
253+
group.bench_function("noop_short", |b| {
254+
b.iter(|| {
255+
criterion::black_box(unescape(b"foobar")).unwrap();
256+
})
257+
});
258+
259+
group.bench_function("noop_long", |b| {
260+
b.iter(|| {
261+
criterion::black_box(unescape(b"just a bit of text without any entities")).unwrap();
262+
})
263+
});
264+
265+
group.bench_function("replacement_chars", |b| {
266+
b.iter(|| {
267+
criterion::black_box(unescape(b"just a bit\n of text without\tany entities")).unwrap();
268+
})
269+
});
270+
271+
group.bench_function("char_reference", |b| {
272+
b.iter(|| {
273+
let text = b"prefix &#34;some stuff&#34;,&#x22;more stuff&#x22;";
274+
criterion::black_box(unescape(text)).unwrap();
275+
let text = b"&#38;&#60;";
276+
criterion::black_box(unescape(text)).unwrap();
277+
})
278+
});
279+
280+
group.bench_function("entity_reference", |b| {
281+
b.iter(|| {
282+
let text = b"age &gt; 72 &amp;&amp; age &lt; 21";
283+
criterion::black_box(unescape(text)).unwrap();
284+
let text = b"&quot;what&apos;s that?&quot;";
285+
criterion::black_box(unescape(text)).unwrap();
286+
})
287+
});
288+
289+
group.finish();
290+
}
291+
292+
/// Benchmarks the `BytesText::unescaped()` method (includes time of `read_event`
293+
/// benchmark)
294+
fn bytes_text_unescaped(c: &mut Criterion) {
295+
let mut group = c.benchmark_group("BytesText::unescaped");
296+
group.bench_function("trim_text = false", |b| {
297+
b.iter(|| {
298+
let mut buf = Vec::new();
299+
let mut r = Reader::from_reader(SAMPLE);
300+
r.check_end_names(false).check_comments(false);
301+
let mut count = criterion::black_box(0);
302+
let mut nbtxt = criterion::black_box(0);
303+
loop {
304+
match r.read_event(&mut buf) {
305+
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
306+
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
307+
Ok(Event::Eof) => break,
308+
_ => (),
309+
}
310+
buf.clear();
311+
}
312+
assert_eq!(
313+
count, 1550,
314+
"Overall tag count in ./tests/documents/sample_rss.xml"
315+
);
316+
317+
// Windows has \r\n instead of \n
318+
#[cfg(windows)]
319+
assert_eq!(
320+
nbtxt, 67661,
321+
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
322+
);
323+
324+
#[cfg(not(windows))]
325+
assert_eq!(
326+
nbtxt, 66277,
327+
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
328+
);
329+
});
330+
});
331+
332+
group.bench_function("trim_text = true", |b| {
333+
b.iter(|| {
334+
let mut buf = Vec::new();
335+
let mut r = Reader::from_reader(SAMPLE);
336+
r.check_end_names(false)
337+
.check_comments(false)
338+
.trim_text(true);
339+
let mut count = criterion::black_box(0);
340+
let mut nbtxt = criterion::black_box(0);
341+
loop {
342+
match r.read_event(&mut buf) {
343+
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
344+
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
345+
Ok(Event::Eof) => break,
346+
_ => (),
347+
}
348+
buf.clear();
349+
}
350+
assert_eq!(
351+
count, 1550,
352+
"Overall tag count in ./tests/documents/sample_rss.xml"
353+
);
354+
355+
// Windows has \r\n instead of \n
356+
#[cfg(windows)]
357+
assert_eq!(
358+
nbtxt, 50334,
359+
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
360+
);
361+
362+
#[cfg(not(windows))]
363+
assert_eq!(
364+
nbtxt, 50261,
365+
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
366+
);
367+
});
368+
});
245369
group.finish();
246370
}
247371

@@ -354,6 +478,7 @@ criterion_group!(
354478
read_resolved_event_into,
355479
one_event,
356480
attributes,
481+
attribute_value_normalization,
357482
escaping,
358483
unescaping,
359484
);

src/errors.rs

+1
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ impl From<EscapeError> for Error {
7474
}
7575

7676
impl From<AttrError> for Error {
77+
/// Creates a new `Error::InvalidAttr` from the given error
7778
#[inline]
7879
fn from(error: AttrError) -> Self {
7980
Error::InvalidAttr(error)

src/escapei.rs

+5-6
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ use std::ops::Range;
88
use pretty_assertions::assert_eq;
99

1010
/// Error for XML escape / unescape.
11-
#[derive(Debug)]
11+
#[derive(Debug, PartialEq)]
1212
pub enum EscapeError {
1313
/// Entity with Null character
1414
EntityWithNull(Range<usize>),
@@ -228,9 +228,8 @@ fn named_entity(name: &str) -> Option<&str> {
228228
#[cfg(feature = "escape-html")]
229229
fn named_entity(name: &str) -> Option<&str> {
230230
// imported from https://dev.w3.org/html5/html-author/charref
231-
// match over strings are not allowed in const functions
232231
//TODO: automate up-to-dating using https://html.spec.whatwg.org/entities.json
233-
let s = match name.as_bytes() {
232+
match name.as_bytes() {
234233
b"Tab" => "\u{09}",
235234
b"NewLine" => "\u{0A}",
236235
b"excl" => "\u{21}",
@@ -1690,7 +1689,7 @@ fn named_entity(name: &str) -> Option<&str> {
16901689
Some(s)
16911690
}
16921691

1693-
fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> {
1692+
pub(crate) fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> {
16941693
let code = if bytes.starts_with('x') {
16951694
parse_hexadecimal(&bytes[1..])
16961695
} else {
@@ -1705,7 +1704,7 @@ fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> {
17051704
}
17061705
}
17071706

1708-
fn parse_hexadecimal(bytes: &str) -> Result<u32, EscapeError> {
1707+
pub(crate) fn parse_hexadecimal(bytes: &str) -> Result<u32, EscapeError> {
17091708
// maximum code is 0x10FFFF => 6 characters
17101709
if bytes.len() > 6 {
17111710
return Err(EscapeError::TooLongHexadecimal);
@@ -1723,7 +1722,7 @@ fn parse_hexadecimal(bytes: &str) -> Result<u32, EscapeError> {
17231722
Ok(code)
17241723
}
17251724

1726-
fn parse_decimal(bytes: &str) -> Result<u32, EscapeError> {
1725+
pub(crate) fn parse_decimal(bytes: &str) -> Result<u32, EscapeError> {
17271726
// maximum code is 0x10FFFF = 1114111 => 7 characters
17281727
if bytes.len() > 7 {
17291728
return Err(EscapeError::TooLongDecimal);

src/events/attributes.rs

+130-1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
55
use crate::errors::Result as XmlResult;
66
use crate::escape::{escape, unescape_with};
7+
use crate::escapei::{self, EscapeError};
78
use crate::name::QName;
89
use crate::reader::{is_whitespace, Reader};
910
use crate::utils::{write_byte_string, write_cow_string, Bytes};
@@ -30,7 +31,84 @@ pub struct Attribute<'a> {
3031
}
3132

3233
impl<'a> Attribute<'a> {
33-
/// Decodes using UTF-8 then unescapes the value.
34+
/// Returns the attribute value normalized as per the XML specification.
35+
///
36+
/// https://www.w3.org/TR/xml/#AVNormalize
37+
///
38+
/// Do not use this method with HTML attributes.
39+
///
40+
/// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
41+
/// and the characters \t, \r, \n are replaced with whitespace characters.
42+
///
43+
/// This will allocate unless the raw attribute value does not require normalization.
44+
///
45+
/// See also [`normalized_value_with_custom_entities()`](#method.normalized_value_with_custom_entities)
46+
pub fn normalized_value(&'a self) -> Result<Cow<'a, str>, EscapeError> {
47+
self.normalized_value_with(|_| None)
48+
}
49+
50+
/// Returns the attribute value normalized as per the XML specification, using custom entities.
51+
///
52+
/// https://www.w3.org/TR/xml/#AVNormalize
53+
///
54+
/// Do not use this method with HTML attributes.
55+
///
56+
/// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
57+
/// and the characters \t, \r, \n are replaced with whitespace characters.
58+
/// Additional entities can be provided in `custom_entities`.
59+
///
60+
/// This will allocate unless the raw attribute value does not require normalization.
61+
///
62+
/// See also [`normalized_value()`](#method.normalized_value)
63+
///
64+
/// # Pre-condition
65+
///
66+
/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
67+
pub fn normalized_value_with<'entity>(
68+
&'a self,
69+
resolve_entity: impl Fn(&str) -> Option<&'entity str>,
70+
) -> Result<Cow<'a, str>, EscapeError> {
71+
// TODO: avoid allocation when not needed
72+
let mut normalized: Vec<u8> = Vec::with_capacity(self.value.len());
73+
74+
let attr = self.value.as_ref();
75+
let mut attr_iter = attr.iter().enumerate();
76+
77+
while let Some((idx, ch)) = attr_iter.next() {
78+
match ch {
79+
b' ' | b'\n' | b'\r' | b'\t' => normalized.push(b' '),
80+
b'&' => {
81+
let end = idx
82+
+ 1
83+
+ attr_iter
84+
.position(|(_, c)| *c == b';')
85+
.ok_or_else(|| EscapeError::UnterminatedEntity(idx..attr.len()))?;
86+
let entity = &attr[idx + 1..end]; // starts after the &
87+
88+
if let Some(s) = escapei::named_entity(entity) {
89+
normalized.extend_from_slice(s.as_bytes());
90+
} else if entity.starts_with(b"#") {
91+
let entity = &entity[1..]; // starts after the #
92+
let codepoint = escapei::parse_number(entity, idx..end)?;
93+
escapei::push_utf8(&mut normalized, codepoint);
94+
} else if let Some(value) = custom_entities.and_then(|hm| hm.get(entity)) {
95+
// TODO: recursively apply entity substitution
96+
normalized.extend_from_slice(&value);
97+
} else {
98+
return Err(EscapeError::UnrecognizedSymbol(
99+
idx + 1..end,
100+
String::from_utf8(entity.to_vec()),
101+
));
102+
}
103+
}
104+
_ => normalized.push(*ch),
105+
}
106+
}
107+
108+
Ok(Cow::Owned(normalized))
109+
}
110+
111+
/// Returns the unescaped value.
34112
///
35113
/// This is normally the value you are interested in. Escape sequences such as `&gt;` are
36114
/// replaced with their unescaped equivalents such as `>`.
@@ -791,6 +869,57 @@ mod xml {
791869
use super::*;
792870
use pretty_assertions::assert_eq;
793871

872+
#[test]
873+
fn attribute_value_normalization() {
874+
// empty value
875+
let raw_value = "".as_bytes();
876+
let output = "".as_bytes().to_vec();
877+
let attr = Attribute::from(("foo".as_bytes(), raw_value));
878+
assert_eq!(attr.normalized_value(), Ok(Cow::Owned::<[u8]>(output)));
879+
880+
// return, tab, and newline characters (0xD, 0x9, 0xA) must be substituted with a space character
881+
let raw_value = "\r\nfoo\rbar\tbaz\n\ndelta\n".as_bytes();
882+
let output = " foo bar baz delta ".as_bytes().to_vec();
883+
let attr = Attribute::from(("foo".as_bytes(), raw_value));
884+
assert_eq!(attr.normalized_value(), Ok(Cow::Owned::<[u8]>(output)));
885+
886+
// entities must be terminated
887+
let raw_value = "abc&quotdef".as_bytes();
888+
let attr = Attribute::from(("foo".as_bytes(), raw_value));
889+
assert_eq!(
890+
attr.normalized_value(),
891+
Err(EscapeError::UnterminatedEntity(3..11))
892+
);
893+
894+
// unknown entities raise error
895+
let raw_value = "abc&unkn;def".as_bytes();
896+
let attr = Attribute::from(("foo".as_bytes(), raw_value));
897+
assert_eq!(
898+
attr.normalized_value(),
899+
Err(EscapeError::UnrecognizedSymbol(4..8, Ok("unkn".to_owned()))) // TODO: is this divergence between range behavior of UnterminatedEntity and UnrecognizedSymbol appropriate. shared with unescape code
900+
);
901+
902+
// // custom entity replacement works, entity replacement text processed recursively
903+
// let raw_value = "&d;&d;A&a;&#x20;&a;B&da;".as_bytes();
904+
// let output = b" A B ".to_vec();
905+
// let attr = Attribute::from(("foo".as_bytes(), raw_value));
906+
// let mut custom_entities = HashMap::new();
907+
// custom_entities.insert(b"d".to_vec(), b"&#xD;".to_vec());
908+
// custom_entities.insert(b"a".to_vec(), b"&#xA;".to_vec());
909+
// custom_entities.insert(b"da".to_vec(), b"&#xD;&#xA;".to_vec());
910+
// dbg!(std::str::from_utf8(attr.normalized_value_with_custom_entities(&custom_entities).unwrap().as_ref()).unwrap());
911+
// assert_eq!(
912+
// attr.normalized_value_with_custom_entities(&custom_entities),
913+
// Ok(Cow::Owned::<[u8]>(output))
914+
// );
915+
916+
// character literal references are substituted without being replaced by spaces
917+
let raw_value = "&#xd;&#xd;A&#xa;&#xa;B&#xd;&#xa;".as_bytes();
918+
let output = "\r\rA\n\nB\r\n".as_bytes().to_vec();
919+
let attr = Attribute::from(("foo".as_bytes(), raw_value));
920+
assert_eq!(attr.normalized_value(), Ok(Cow::Owned::<[u8]>(output)));
921+
}
922+
794923
/// Checked attribute is the single attribute
795924
mod single {
796925
use super::*;

0 commit comments

Comments
 (0)