Skip to content

Commit 04faab3

Browse files
committed
Properly normalize attribute values
closes tafia#371
1 parent e701c4d commit 04faab3

File tree

3 files changed

+121
-1
lines changed

3 files changed

+121
-1
lines changed

src/errors.rs

+1
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ impl From<EscapeError> for Error {
6262
}
6363

6464
impl From<AttrError> for Error {
65+
/// Creates a new `Error::InvalidAttr` from the given error
6566
#[inline]
6667
fn from(error: AttrError) -> Self {
6768
Error::InvalidAttr(error)

src/escapei.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ pub fn unescape(raw: &[u8]) -> Result<Cow<[u8]>, EscapeError> {
134134
}
135135

136136
/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
137-
/// value, using a dictionnary of custom entities.
137+
/// value, using a dictionary of custom entities.
138138
///
139139
/// # Pre-condition
140140
///

src/events/attributes.rs

+119
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,12 @@ pub struct Attribute<'a> {
3232
}
3333

3434
impl<'a> Attribute<'a> {
35+
pub fn normalized_value(&'a self) -> Result<Cow<'a, [u8]>> {
36+
let normalized = normalize_attribute_value(&*self.value);
37+
let escaped = do_unescape(&*normalized, None).map_err(Error::EscapeError)?;
38+
Ok(Cow::Owned(escaped.into_owned()))
39+
}
40+
3541
/// Returns the unescaped value.
3642
///
3743
/// This is normally the value you are interested in. Escape sequences such as `&gt;` are
@@ -289,6 +295,92 @@ impl<'a> From<Attr<&'a [u8]>> for Attribute<'a> {
289295
}
290296
}
291297

298+
/// Normalizes an attribute value following section 3.3.3 of the XML specification.
///
/// <https://www.w3.org/TR/xml/#AVNormalize>
///
/// * Whitespace-like bytes (`\r`, `\n`, `\t`, `' '`) are trimmed from both ends of the value
/// * Every run of whitespace-like bytes is replaced with a single space (`0x20`)
/// * Character and entity references are *not* substituted yet (see the TODO in the body)
///
/// The input is borrowed whenever the trimmed value is already in normalized form
/// (words separated by single spaces); a new buffer is allocated only when at least
/// one byte has to be replaced or dropped.
///
/// NOTE(review): the specification prescribes trimming and collapsing only for
/// attributes whose declared type is not CDATA; this function applies both
/// unconditionally. Confirm that this is the intended behavior.
fn normalize_attribute_value(attr: &[u8]) -> Cow<[u8]> {
    // TODO: character references, entity references, error handling associated with those

    fn is_whitespace_like(byte: u8) -> bool {
        matches!(byte, b'\n' | b'\r' | b'\t' | b' ')
    }

    // Trim whitespace-like bytes from both ends. When the whole value is whitespace
    // `start` equals `attr.len()` and the resulting slice is empty.
    let start = attr
        .iter()
        .position(|&byte| !is_whitespace_like(byte))
        .unwrap_or(attr.len());
    let end = attr
        .iter()
        .rposition(|&byte| !is_whitespace_like(byte))
        .map_or(start, |last| last + 1);
    let trimmed = &attr[start..end];

    // A new buffer is only created once we meet a byte that cannot be kept as is.
    let mut normalized: Option<Vec<u8>> = None;
    // The trimmed value never starts with whitespace.
    let mut after_whitespace = false;

    for (idx, &byte) in trimmed.iter().enumerate() {
        if is_whitespace_like(byte) {
            // A lone `' '` is already normalized. Anything else (`\t`, `\r`, `\n`, or a
            // second consecutive whitespace byte) forces a switch to an owned buffer
            // seeded with everything processed so far, which is known to be normalized.
            let must_rewrite = byte != b' ' || after_whitespace;
            if must_rewrite && normalized.is_none() {
                let mut buf = Vec::with_capacity(trimmed.len());
                buf.extend_from_slice(&trimmed[..idx]);
                normalized = Some(buf);
            }
            // Only the first byte of a whitespace run produces output.
            if !after_whitespace {
                if let Some(buf) = normalized.as_mut() {
                    buf.push(b' ');
                }
            }
            after_whitespace = true;
        } else {
            if let Some(buf) = normalized.as_mut() {
                buf.push(byte);
            }
            after_whitespace = false;
        }
    }

    match normalized {
        Some(buf) => Cow::Owned(buf),
        None => Cow::Borrowed(trimmed),
    }
}
383+
292384
////////////////////////////////////////////////////////////////////////////////////////////////////
293385

294386
/// Iterator over XML attributes.
@@ -2353,4 +2445,31 @@ mod html {
23532445
assert_eq!(iter.next(), None);
23542446
assert_eq!(iter.next(), None);
23552447
}
2448+
2449+
/// Checks trimming, whitespace replacement and whitespace-run collapsing performed by
/// `normalize_attribute_value`.
#[test]
fn attribute_value_normalization() {
    // empty value
    assert_eq!(normalize_attribute_value(b"").as_ref(), b"");
    // a value made only of whitespace-like characters normalizes to the empty value
    assert_eq!(normalize_attribute_value(b" \t\r\n ").as_ref(), b"");
    // return, tab, and newline characters (0xD, 0x9, 0xA) must be replaced with a space character
    assert_eq!(
        normalize_attribute_value(b"\rfoo\rbar\tbaz\ndelta\n").as_ref(),
        b"foo bar baz delta"
    );
    // leading and trailing spaces must be stripped
    assert_eq!(normalize_attribute_value(b" foo ").as_ref(), b"foo");
    // leading space
    assert_eq!(normalize_attribute_value(b" bar").as_ref(), b"bar");
    // trailing space
    assert_eq!(normalize_attribute_value(b"baz ").as_ref(), b"baz");
    // sequences of spaces must be replaced with a single space
    assert_eq!(
        normalize_attribute_value(b"  foo   bar    baz  ").as_ref(),
        b"foo bar baz"
    );
    // sequence replacement mixed with characters treated as whitespace (\t \r \n)
    assert_eq!(
        normalize_attribute_value(b" \tfoo\tbar  \rbaz \n\ndelta\n\t\r echo   foxtrot\r").as_ref(),
        b"foo bar baz delta echo foxtrot"
    );
}
23562475
}

0 commit comments

Comments
 (0)