Skip to content

Commit ae266a1

Browse files
committed
Properly normalize attribute values
closes tafia#371
1 parent d872771 commit ae266a1

File tree

2 files changed

+180
-41
lines changed

2 files changed

+180
-41
lines changed

src/events/attributes.rs

+125-1
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,96 @@ impl<'a> From<(&'a str, &'a str)> for Attribute<'a> {
331331
}
332332
}
333333

334+
// 1) All line breaks MUST have been normalized on input to #xA as described in 2.11 End-of-Line Handling, so the rest of this algorithm operates on text normalized in this way.
335+
// 2) Begin with a normalized value consisting of the empty string.
336+
// 3) For each character, entity reference, or character reference in the unnormalized attribute value, beginning with the first and continuing to the last, do the following:
337+
// * For a character reference, append the referenced character to the normalized value.
338+
// * For an entity reference, recursively apply step 3 of this algorithm to the replacement text of the entity.
339+
// * For a white space character (#x20, #xD, #xA, #x9), append a space character (#x20) to the normalized value.
340+
// * For another character, append the character to the normalized value.
341+
//
342+
// If the attribute type is not CDATA, then the XML processor MUST further process the normalized attribute value by discarding any leading and trailing space (#x20) characters,
343+
// and by replacing sequences of space (#x20) characters by a single space (#x20) character.
344+
//
345+
// Note that if the unnormalized attribute value contains a character reference to a white space character other than space (#x20), the normalized value contains the referenced
346+
// character itself (#xD, #xA or #x9). This contrasts with the case where the unnormalized value contains a white space character (not a reference), which is replaced with a
347+
// space character (#x20) in the normalized value and also contrasts with the case where the unnormalized value contains an entity reference whose replacement text contains a
348+
// white space character; being recursively processed, the white space character is replaced with a space character (#x20) in the normalized value.
349+
fn normalize_attribute_value(attr: Cow<[u8]>) -> Cow<[u8]> {
350+
// TODO: character references, entity references
351+
// TODO: don't allocated unless needed?
352+
353+
#[derive(PartialEq)]
354+
enum ParseState {
355+
SpaceOrStart,
356+
CDATA,
357+
}
358+
359+
let mut value: Vec<u8> = Vec::new();
360+
// Starting in the state where we think we've added a space means we implicitly skip leading spaces
361+
let mut current_state = ParseState::SpaceOrStart;
362+
// Used for trimming trailing spaces
363+
let mut last_cdata_idx = 0;
364+
365+
// In one pass, strip leading and trailing spaces and replace sequences of spaces with a single one
366+
for ch in attr.as_ref() {
367+
match current_state {
368+
ParseState::SpaceOrStart => match ch {
369+
b'\n' | b'\r' | b'\t' | b' ' => continue,
370+
c @ _ => {
371+
current_state = ParseState::CDATA;
372+
last_cdata_idx = value.len();
373+
value.push(*c);
374+
}
375+
},
376+
ParseState::CDATA => match ch {
377+
b'\n' | b'\r' | b'\t' | b' ' => {
378+
current_state = ParseState::SpaceOrStart;
379+
value.push(b' ');
380+
}
381+
c @ _ => {
382+
last_cdata_idx = value.len();
383+
value.push(*c)
384+
}
385+
},
386+
}
387+
}
388+
389+
// Trim any trailing spaces
390+
if current_state == ParseState::SpaceOrStart {
391+
value.truncate(last_cdata_idx + 1);
392+
}
393+
394+
Cow::Owned(value)
395+
396+
397+
// let mut value: Vec<u8> = Vec::new();
398+
399+
// // TODO: replace sequences of spaces
400+
// for i in 0..attr.len() {
401+
// let ch = attr[i];
402+
// match ch {
403+
// b'\n' => value.push(b' '),
404+
// b'\r' => value.push(b' '),
405+
// b'\t' => value.push(b' '),
406+
// c @ _ => value.push(c),
407+
// }
408+
// }
409+
410+
// // Position where value starts after whitespace.
411+
// let first_non_space_char = value
412+
// .iter()
413+
// .position(|c| !c.is_ascii_whitespace())
414+
// .unwrap_or(0);
415+
// // Position where the trailing whitespace starts.
416+
// let last_non_space_char = value
417+
// .iter()
418+
// .rposition(|c| !c.is_ascii_whitespace())
419+
// .and_then(|idx| Some(idx + 1))
420+
// .unwrap_or(0);
421+
// Cow::Owned(value[first_non_space_char..last_non_space_char].to_vec())
422+
}
423+
334424
impl<'a> Iterator for Attributes<'a> {
335425
type Item = Result<Attribute<'a>>;
336426
fn next(&mut self) -> Option<Self::Item> {
@@ -355,7 +445,7 @@ impl<'a> Iterator for Attributes<'a> {
355445
($key:expr, $val:expr) => {
356446
Some(Ok(Attribute {
357447
key: &self.bytes[$key],
358-
value: Cow::Borrowed(&self.bytes[$val]),
448+
value: normalize_attribute_value(Cow::Borrowed(&self.bytes[$val])),
359449
}))
360450
};
361451
}
@@ -513,4 +603,38 @@ mod tests {
513603
assert_eq!(&*a.value, b"ee");
514604
assert!(attributes.next().is_none());
515605
}
606+
607+
#[test]
608+
fn attribute_value_normalization() {
609+
// return, tab, and newline characters (0xD, 0x9, 0xA) must be replaced with a space character
610+
assert_eq!(
611+
normalize_attribute_value(Cow::Borrowed(b"\rfoo\rbar\tbaz\ndelta\n")).as_ref(),
612+
b"foo bar baz delta"
613+
);
614+
// leading and trailing spaces must be stripped
615+
assert_eq!(
616+
normalize_attribute_value(Cow::Borrowed(b" foo ")).as_ref(),
617+
b"foo"
618+
);
619+
// leading space
620+
assert_eq!(
621+
normalize_attribute_value(Cow::Borrowed(b" bar")).as_ref(),
622+
b"bar"
623+
);
624+
// trailing space
625+
assert_eq!(
626+
normalize_attribute_value(Cow::Borrowed(b"baz ")).as_ref(),
627+
b"baz"
628+
);
629+
// sequences of spaces must be replaced with a single space
630+
assert_eq!(
631+
normalize_attribute_value(Cow::Borrowed(b" foo bar baz ")).as_ref(),
632+
b"foo bar baz"
633+
);
634+
// sequence replacement mixed with characters treated as whitespace (\t \r \n)
635+
assert_eq!(
636+
normalize_attribute_value(Cow::Borrowed(b" \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r")).as_ref(),
637+
b"foo bar baz delta echo foxtrot"
638+
);
639+
}
516640
}

tests/test.rs

+55-40
Original file line numberDiff line numberDiff line change
@@ -35,18 +35,19 @@ fn test_attributes_empty() {
3535
match r.read_event(&mut buf) {
3636
Ok(Empty(e)) => {
3737
let mut atts = e.attributes();
38+
3839
match atts.next() {
39-
Some(Ok(Attribute {
40-
key: b"att1",
41-
value: Cow::Borrowed(b"a"),
42-
})) => (),
40+
Some(Ok(a)) => {
41+
assert_eq!(b"att1", a.key);
42+
assert_eq!(b"a", a.value.as_ref());
43+
}
4344
e => panic!("Expecting att1='a' attribute, found {:?}", e),
4445
}
4546
match atts.next() {
46-
Some(Ok(Attribute {
47-
key: b"att2",
48-
value: Cow::Borrowed(b"b"),
49-
})) => (),
47+
Some(Ok(a)) => {
48+
assert_eq!(b"att2", a.key);
49+
assert_eq!(b"b", a.value.as_ref());
50+
}
5051
e => panic!("Expecting att2='b' attribute, found {:?}", e),
5152
}
5253
match atts.next() {
@@ -68,10 +69,10 @@ fn test_attribute_equal() {
6869
Ok(Empty(e)) => {
6970
let mut atts = e.attributes();
7071
match atts.next() {
71-
Some(Ok(Attribute {
72-
key: b"att1",
73-
value: Cow::Borrowed(b"a=b"),
74-
})) => (),
72+
Some(Ok(a)) => {
73+
assert_eq!(a.key, b"att1");
74+
assert_eq!(a.value.as_ref(), b"a=b");
75+
}
7576
e => panic!("Expecting att1=\"a=b\" attribute, found {:?}", e),
7677
}
7778
match atts.next() {
@@ -119,18 +120,23 @@ fn test_attributes_empty_ns() {
119120
.attributes()
120121
.map(|ar| ar.expect("Expecting attribute parsing to succeed."))
121122
// we don't care about xmlns attributes for this test
122-
.filter(|kv| !kv.key.starts_with(b"xmlns"))
123-
.map(|Attribute { key: name, value }| {
124-
let (opt_ns, local_name) = r.attribute_namespace(name, &ns_buf);
125-
(opt_ns, local_name, value)
126-
});
123+
.filter(|kv| !kv.key.starts_with(b"xmlns"));
124+
127125
match atts.next() {
128-
Some((None, b"att1", Cow::Borrowed(b"a"))) => (),
126+
Some(a) => {
127+
let (opt_ns, local_name) = r.attribute_namespace(a.key, &ns_buf);
128+
assert_eq!(opt_ns, None);
129+
assert_eq!(local_name, b"att1");
130+
assert_eq!(a.value.as_ref(), b"a");
131+
}
129132
e => panic!("Expecting att1='a' attribute, found {:?}", e),
130133
}
131134
match atts.next() {
132-
Some((Some(ns), b"att2", Cow::Borrowed(b"b"))) => {
133-
assert_eq!(&ns[..], b"urn:example:r");
135+
Some(a) => {
136+
let (opt_ns, local_name) = r.attribute_namespace(a.key, &ns_buf);
137+
assert_eq!(opt_ns.as_ref(), Some(&b"urn:example:r".as_ref()));
138+
assert_eq!(local_name, b"att2");
139+
assert_eq!(a.value.as_ref(), b"b");
134140
}
135141
e => panic!(
136142
"Expecting {{urn:example:r}}att2='b' attribute, found {:?}",
@@ -164,18 +170,23 @@ fn test_attributes_empty_ns_expanded() {
164170
.attributes()
165171
.map(|ar| ar.expect("Expecting attribute parsing to succeed."))
166172
// we don't care about xmlns attributes for this test
167-
.filter(|kv| !kv.key.starts_with(b"xmlns"))
168-
.map(|Attribute { key: name, value }| {
169-
let (opt_ns, local_name) = r.attribute_namespace(name, &ns_buf);
170-
(opt_ns, local_name, value)
171-
});
173+
.filter(|kv| !kv.key.starts_with(b"xmlns"));
174+
172175
match atts.next() {
173-
Some((None, b"att1", Cow::Borrowed(b"a"))) => (),
176+
Some(a) => {
177+
let (opt_ns, local_name) = r.attribute_namespace(a.key, &ns_buf);
178+
assert_eq!(opt_ns, None);
179+
assert_eq!(local_name, b"att1");
180+
assert_eq!(a.value.as_ref(), b"a");
181+
}
174182
e => panic!("Expecting att1='a' attribute, found {:?}", e),
175183
}
176184
match atts.next() {
177-
Some((Some(ns), b"att2", Cow::Borrowed(b"b"))) => {
178-
assert_eq!(&ns[..], b"urn:example:r");
185+
Some(a) => {
186+
let (opt_ns, local_name) = r.attribute_namespace(a.key, &ns_buf);
187+
assert_eq!(opt_ns.as_ref(), Some(&b"urn:example:r".as_ref()));
188+
assert_eq!(local_name, b"att2");
189+
assert_eq!(a.value.as_ref(), b"b");
179190
}
180191
e => panic!(
181192
"Expecting {{urn:example:r}}att2='b' attribute, found {:?}",
@@ -229,15 +240,17 @@ fn test_default_ns_shadowing_empty() {
229240
.attributes()
230241
.map(|ar| ar.expect("Expecting attribute parsing to succeed."))
231242
// we don't care about xmlns attributes for this test
232-
.filter(|kv| !kv.key.starts_with(b"xmlns"))
233-
.map(|Attribute { key: name, value }| {
234-
let (opt_ns, local_name) = r.attribute_namespace(name, &ns_buf);
235-
(opt_ns, local_name, value)
236-
});
243+
.filter(|kv| !kv.key.starts_with(b"xmlns"));
244+
237245
// the attribute should _not_ have a namespace name. The default namespace does not
238246
// apply to attributes.
239247
match atts.next() {
240-
Some((None, b"att1", Cow::Borrowed(b"a"))) => (),
248+
Some(a) => {
249+
let (opt_ns, local_name) = r.attribute_namespace(a.key, &ns_buf);
250+
assert_eq!(opt_ns, None);
251+
assert_eq!(local_name, b"att1");
252+
assert_eq!(a.value.as_ref(), b"a")
253+
}
241254
e => panic!("Expecting att1='a' attribute, found {:?}", e),
242255
}
243256
match atts.next() {
@@ -291,15 +304,17 @@ fn test_default_ns_shadowing_expanded() {
291304
.attributes()
292305
.map(|ar| ar.expect("Expecting attribute parsing to succeed."))
293306
// we don't care about xmlns attributes for this test
294-
.filter(|kv| !kv.key.starts_with(b"xmlns"))
295-
.map(|Attribute { key: name, value }| {
296-
let (opt_ns, local_name) = r.attribute_namespace(name, &ns_buf);
297-
(opt_ns, local_name, value)
298-
});
307+
.filter(|kv| !kv.key.starts_with(b"xmlns"));
308+
299309
// the attribute should _not_ have a namespace name. The default namespace does not
300310
// apply to attributes.
301311
match atts.next() {
302-
Some((None, b"att1", Cow::Borrowed(b"a"))) => (),
312+
Some(a) => {
313+
let (opt_ns, local_name) = r.attribute_namespace(a.key, &ns_buf);
314+
assert_eq!(opt_ns, None);
315+
assert_eq!(local_name, b"att1");
316+
assert_eq!(a.value.as_ref(), b"a");
317+
}
303318
e => panic!("Expecting att1='a' attribute, found {:?}", e),
304319
}
305320
match atts.next() {

0 commit comments

Comments
 (0)