@@ -32,6 +32,12 @@ pub struct Attribute<'a> {
32
32
}
33
33
34
34
impl < ' a > Attribute < ' a > {
35
+ pub fn normalized_value ( & ' a self ) -> Result < Cow < ' a , [ u8 ] > > {
36
+ let normalized = normalize_attribute_value ( & * self . value ) ;
37
+ let escaped = do_unescape ( & * normalized, None ) . map_err ( Error :: EscapeError ) ?;
38
+ Ok ( Cow :: Owned ( escaped. into_owned ( ) ) )
39
+ }
40
+
35
41
/// Returns the unescaped value.
36
42
///
37
43
/// This is normally the value you are interested in. Escape sequences such as `&gt;` are
@@ -289,6 +295,92 @@ impl<'a> From<Attr<&'a [u8]>> for Attribute<'a> {
289
295
}
290
296
}
291
297
298
/// Normalizes the whitespace of an attribute value, following section 3.3.3 of the XML
/// specification (<https://www.w3.org/TR/xml/#AVNormalize>).
///
/// * Whitespace-like bytes (`\r`, `\n`, `\t`, `' '`) are trimmed from both ends of the value.
/// * Every remaining run of whitespace-like bytes is replaced with a single `' '`.
///
/// Character and entity references are *not* substituted here yet (see the TODO below).
///
/// NOTE(review): the specification prescribes trimming and collapsing only for attributes
/// whose declared type is not CDATA; this function applies them unconditionally.
///
/// Returns `Cow::Borrowed` (a sub-slice of `attr`) whenever trimming alone already yields the
/// normalized value, and `Cow::Owned` only when bytes inside the value had to be replaced or
/// dropped.
fn normalize_attribute_value(attr: &[u8]) -> Cow<[u8]> {
    // TODO: character references, entity references, error handling associated with those

    fn is_whitespace_like(byte: u8) -> bool {
        matches!(byte, b'\n' | b'\r' | b'\t' | b' ')
    }

    // Trim whitespace-like bytes from both ends - this can't fail.
    let start = match attr.iter().position(|&byte| !is_whitespace_like(byte)) {
        Some(start) => start,
        // The entire value was whitespace-like bytes (or empty).
        None => return Cow::Borrowed(&attr[..0]),
    };
    // A non-whitespace byte exists, so `rposition` always succeeds here; the fallback is
    // nevertheless a valid exclusive upper bound.
    let end = attr
        .iter()
        .rposition(|&byte| !is_whitespace_like(byte))
        .map_or(attr.len(), |last| last + 1);
    let trimmed = &attr[start..end];

    // An owned buffer is created lazily, at the first byte whose output differs from the input.
    let mut normalized: Option<Vec<u8>> = None;
    // `trimmed` starts with a non-whitespace byte, so we never start inside a run.
    let mut in_whitespace_run = false;

    for (idx, &byte) in trimmed.iter().enumerate() {
        let is_space = is_whitespace_like(byte);
        // What this input byte contributes to the output: itself, a single space for the
        // first byte of a whitespace run, or nothing for the rest of the run.
        let emitted = if !is_space {
            Some(byte)
        } else if in_whitespace_run {
            None
        } else {
            Some(b' ')
        };
        in_whitespace_run = is_space;

        match normalized.as_mut() {
            Some(buf) => buf.extend(emitted),
            // Output still matches the input byte-for-byte: keep borrowing.
            None if emitted == Some(byte) => {}
            None => {
                // First divergence: copy the untouched prefix, then continue in the buffer.
                // The output is never longer than the trimmed input.
                let mut buf = Vec::with_capacity(trimmed.len());
                buf.extend_from_slice(&trimmed[..idx]);
                buf.extend(emitted);
                normalized = Some(buf);
            }
        }
    }

    match normalized {
        Some(buf) => Cow::Owned(buf),
        None => Cow::Borrowed(trimmed),
    }
}
383
+
292
384
////////////////////////////////////////////////////////////////////////////////////////////////////
293
385
294
386
/// Iterator over XML attributes.
@@ -2353,4 +2445,31 @@ mod html {
2353
2445
assert_eq ! ( iter. next( ) , None ) ;
2354
2446
assert_eq ! ( iter. next( ) , None ) ;
2355
2447
}
2448
+
2449
/// Checks trimming and whitespace collapsing performed by `normalize_attribute_value`.
#[test]
fn attribute_value_normalization() {
    // empty value
    assert_eq!(normalize_attribute_value(b"").as_ref(), b"");
    // a value made up solely of whitespace-like characters normalizes to the empty value
    assert_eq!(normalize_attribute_value(b" \t\r\n ").as_ref(), b"");
    // return, tab, and newline characters (0xD, 0x9, 0xA) must be replaced with a space character
    assert_eq!(
        normalize_attribute_value(b"\r foo\r bar\t baz\n delta\n ").as_ref(),
        b"foo bar baz delta"
    );
    // leading and trailing spaces must be stripped
    assert_eq!(normalize_attribute_value(b" foo ").as_ref(), b"foo");
    // leading space
    assert_eq!(normalize_attribute_value(b" bar").as_ref(), b"bar");
    // trailing space
    assert_eq!(normalize_attribute_value(b"baz ").as_ref(), b"baz");
    // single interior spaces are preserved as-is
    assert_eq!(
        normalize_attribute_value(b" foo bar baz ").as_ref(),
        b"foo bar baz"
    );
    // sequences of spaces must be replaced with a single space
    assert_eq!(
        normalize_attribute_value(b"   foo   bar    baz  ").as_ref(),
        b"foo bar baz"
    );
    // sequence replacement mixed with characters treated as whitespace (\t \r \n)
    assert_eq!(
        normalize_attribute_value(b" \t foo\t bar \r baz \n \n delta\n \t \r echo foxtrot\r ")
            .as_ref(),
        b"foo bar baz delta echo foxtrot"
    );
}
2356
2475
}
0 commit comments