|
4 | 4 |
|
5 | 5 | use crate::errors::{Error, Result as XmlResult};
|
6 | 6 | use crate::escape::{do_unescape, escape};
|
| 7 | +use crate::escapei::EscapeError; |
7 | 8 | use crate::name::QName;
|
8 | 9 | use crate::reader::{is_whitespace, Reader};
|
9 | 10 | use crate::utils::{write_byte_string, write_cow_string, Bytes};
|
@@ -32,6 +33,13 @@ pub struct Attribute<'a> {
|
32 | 33 | }
|
33 | 34 |
|
34 | 35 | impl<'a> Attribute<'a> {
|
| 36 | + /// |
| 37 | + pub fn normalized_value(&'a self) -> Result<Cow<'a, [u8]>, EscapeError> { |
| 38 | + let normalized = normalize_attribute_value(self.value.as_ref()); |
| 39 | + let escaped = do_unescape(&*normalized, None)?; |
| 40 | + Ok(Cow::Owned(escaped.into_owned())) |
| 41 | + } |
| 42 | + |
35 | 43 | /// Returns the unescaped value.
|
36 | 44 | ///
|
37 | 45 | /// This is normally the value you are interested in. Escape sequences such as `&gt;` are
|
@@ -289,6 +297,90 @@ impl<'a> From<Attr<&'a [u8]>> for Attribute<'a> {
|
289 | 297 | }
|
290 | 298 | }
|
291 | 299 |
|
| 300 | +/// Normalize the attribute value according to xml specification section 3.3.3 |
| 301 | +/// |
| 302 | +/// https://www.w3.org/TR/xml/#AVNormalize |
| 303 | +/// |
| 304 | +/// * Whitespace-like characters (\r, \n, \t, ' ') are trimmed from the ends of the value |
| 305 | +/// * Sequences of whitespace-like characters are replaced with a single whitespace character |
| 306 | +/// * Character and entity references are substituted as defined by the spec |
| 307 | +fn normalize_attribute_value(attr: &[u8]) -> Cow<[u8]> { |
| 308 | + // TODO: character references, entity references, error handling associated with those |
| 309 | + |
| 310 | + #[derive(PartialEq)] |
| 311 | + enum ParseState { |
| 312 | + Space, |
| 313 | + CDATA, |
| 314 | + } |
| 315 | + |
| 316 | + // Trim characters from the beginning and end of the attribute value - this can't fail. |
| 317 | + fn trim_value(attr: &[u8]) -> &[u8] { |
| 318 | + let first_non_space_char = attr.iter().position(|c| !is_whitespace(*c)); |
| 319 | + |
| 320 | + if first_non_space_char.is_none() { |
| 321 | + // The entire value was whitespace-like characters |
| 322 | + return b""; |
| 323 | + } |
| 324 | + |
| 325 | + let last_non_space_char = attr.iter().rposition(|c| !is_whitespace(*c)); |
| 326 | + |
| 327 | + // Trim all whitespace-like characters away from the beginning and end of the attribute value. |
| 328 | + let begin = first_non_space_char.unwrap(); |
| 329 | + let end = last_non_space_char.unwrap_or(attr.len()); |
| 330 | + &attr[begin..=end] |
| 331 | + } |
| 332 | + |
| 333 | + let trimmed_attr = trim_value(attr); |
| 334 | + |
| 335 | + // A new buffer is only created when we encounter a situation that requires it. |
| 336 | + let mut normalized: Option<Vec<u8>> = None; |
| 337 | + // We start on character data because all whitespace-like characters are already trimmed away. |
| 338 | + let mut current_state = ParseState::CDATA; |
| 339 | + |
| 340 | + // Perform a single pass over the trimmed attribute value. If we encounter a character / entity reference |
| 341 | + // or whitespace-like characters that need to be substituted, copy everything processed thus far to a new |
| 342 | + // buffer and continue using this buffer. |
| 343 | + for (idx, ch) in trimmed_attr.iter().enumerate() { |
| 344 | + match ch { |
| 345 | + b'\n' | b'\r' | b'\t' | b' ' => match current_state { |
| 346 | + ParseState::Space => match normalized { |
| 347 | + Some(_) => continue, |
| 348 | + None => normalized = Some(Vec::from(&trimmed_attr[..idx])), |
| 349 | + }, |
| 350 | + ParseState::CDATA => { |
| 351 | + current_state = ParseState::Space; |
| 352 | + match normalized.as_mut() { |
| 353 | + Some(buf) => buf.push(b' '), |
| 354 | + None => { |
| 355 | + let mut buf = Vec::from(&trimmed_attr[..idx]); |
| 356 | + buf.push(b' '); |
| 357 | + normalized = Some(buf); |
| 358 | + } |
| 359 | + } |
| 360 | + } |
| 361 | + }, |
| 362 | + c @ _ => match current_state { |
| 363 | + ParseState::Space => { |
| 364 | + current_state = ParseState::CDATA; |
| 365 | + if let Some(normalized) = normalized.as_mut() { |
| 366 | + normalized.push(*c); |
| 367 | + } |
| 368 | + } |
| 369 | + ParseState::CDATA => { |
| 370 | + if let Some(normalized) = normalized.as_mut() { |
| 371 | + normalized.push(*c); |
| 372 | + } |
| 373 | + } |
| 374 | + }, |
| 375 | + } |
| 376 | + } |
| 377 | + |
| 378 | + match normalized { |
| 379 | + Some(normalized) => Cow::Owned(normalized), |
| 380 | + None => Cow::Borrowed(trimmed_attr), |
| 381 | + } |
| 382 | +} |
| 383 | + |
292 | 384 | ////////////////////////////////////////////////////////////////////////////////////////////////////
|
293 | 385 |
|
294 | 386 | /// Iterator over XML attributes.
|
@@ -905,6 +997,33 @@ mod xml {
|
905 | 997 | use super::*;
|
906 | 998 | use pretty_assertions::assert_eq;
|
907 | 999 |
|
/// Checks `normalize_attribute_value` against a series of raw / expected value pairs
#[test]
fn attribute_value_normalization() {
    // Every case compares the normalized bytes with the expected bytes
    let check = |raw: &[u8], expected: &[u8]| {
        assert_eq!(normalize_attribute_value(raw), Cow::Borrowed(expected));
    };

    // empty value
    check(b"", b"");
    // return, tab, and newline characters (0xD, 0x9, 0xA) must be replaced with a space character
    check(b"\rfoo\rbar\tbaz\ndelta\n", b"foo bar baz delta");
    // leading and trailing spaces must be stripped
    check(b" foo ", b"foo");
    // leading space
    check(b" bar", b"bar");
    // trailing space
    check(b"baz ", b"baz");
    // sequences of spaces must be replaced with a single space
    check(b" foo bar baz ", b"foo bar baz");
    // sequence replacement mixed with characters treated as whitespace (\t \r \n)
    check(
        b" \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r",
        b"foo bar baz delta echo foxtrot",
    );
}
| 1026 | + |
908 | 1027 | /// Checked attribute is the single attribute
|
909 | 1028 | mod single {
|
910 | 1029 | use super::*;
|
|
0 commit comments