4
4
5
5
use crate :: errors:: Result as XmlResult ;
6
6
use crate :: escape:: { escape, unescape_with} ;
7
+ use crate :: escapei:: { self , EscapeError } ;
7
8
use crate :: name:: QName ;
8
9
use crate :: reader:: { is_whitespace, Reader } ;
9
10
use crate :: utils:: { write_byte_string, write_cow_string, Bytes } ;
@@ -30,7 +31,84 @@ pub struct Attribute<'a> {
30
31
}
31
32
32
33
impl < ' a > Attribute < ' a > {
33
- /// Decodes using UTF-8 then unescapes the value.
34
+ /// Returns the attribute value normalized as per the XML specification.
35
+ ///
36
+ /// https://www.w3.org/TR/xml/#AVNormalize
37
+ ///
38
+ /// Do not use this method with HTML attributes.
39
+ ///
40
+ /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>`
41
+ /// and the characters \t, \r, \n are replaced with whitespace characters.
42
+ ///
43
+ /// This will allocate unless the raw attribute value does not require normalization.
44
+ ///
45
+ /// See also [`normalized_value_with_custom_entities()`](#method.normalized_value_with_custom_entities)
46
+ pub fn normalized_value ( & ' a self ) -> Result < Cow < ' a , str > , EscapeError > {
47
+ self . normalized_value_with ( |_| None )
48
+ }
49
+
50
+ /// Returns the attribute value normalized as per the XML specification, using custom entities.
51
+ ///
52
+ /// https://www.w3.org/TR/xml/#AVNormalize
53
+ ///
54
+ /// Do not use this method with HTML attributes.
55
+ ///
56
+ /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>`
57
+ /// and the characters \t, \r, \n are replaced with whitespace characters.
58
+ /// Additional entities can be provided in `custom_entities`.
59
+ ///
60
+ /// This will allocate unless the raw attribute value does not require normalization.
61
+ ///
62
+ /// See also [`normalized_value()`](#method.normalized_value)
63
+ ///
64
+ /// # Pre-condition
65
+ ///
66
+ /// The keys and values of `custom_entities`, if any, must be valid UTF-8.
67
+ pub fn normalized_value_with < ' entity > (
68
+ & ' a self ,
69
+ resolve_entity : impl Fn ( & str ) -> Option < & ' entity str > ,
70
+ ) -> Result < Cow < ' a , str > , EscapeError > {
71
+ // TODO: avoid allocation when not needed
72
+ let mut normalized: Vec < u8 > = Vec :: with_capacity ( self . value . len ( ) ) ;
73
+
74
+ let attr = self . value . as_ref ( ) ;
75
+ let mut attr_iter = attr. iter ( ) . enumerate ( ) ;
76
+
77
+ while let Some ( ( idx, ch) ) = attr_iter. next ( ) {
78
+ match ch {
79
+ b' ' | b'\n' | b'\r' | b'\t' => normalized. push ( b' ' ) ,
80
+ b'&' => {
81
+ let end = idx
82
+ + 1
83
+ + attr_iter
84
+ . position ( |( _, c) | * c == b';' )
85
+ . ok_or_else ( || EscapeError :: UnterminatedEntity ( idx..attr. len ( ) ) ) ?;
86
+ let entity = & attr[ idx + 1 ..end] ; // starts after the &
87
+
88
+ if let Some ( s) = escapei:: named_entity ( entity) {
89
+ normalized. extend_from_slice ( s. as_bytes ( ) ) ;
90
+ } else if entity. starts_with ( b"#" ) {
91
+ let entity = & entity[ 1 ..] ; // starts after the #
92
+ let codepoint = escapei:: parse_number ( entity, idx..end) ?;
93
+ escapei:: push_utf8 ( & mut normalized, codepoint) ;
94
+ } else if let Some ( value) = custom_entities. and_then ( |hm| hm. get ( entity) ) {
95
+ // TODO: recursively apply entity substitution
96
+ normalized. extend_from_slice ( & value) ;
97
+ } else {
98
+ return Err ( EscapeError :: UnrecognizedSymbol (
99
+ idx + 1 ..end,
100
+ String :: from_utf8 ( entity. to_vec ( ) ) ,
101
+ ) ) ;
102
+ }
103
+ }
104
+ _ => normalized. push ( * ch) ,
105
+ }
106
+ }
107
+
108
+ Ok ( Cow :: Owned ( normalized) )
109
+ }
110
+
111
+ /// Returns the unescaped value.
34
112
///
35
113
/// This is normally the value you are interested in. Escape sequences such as `&gt;` are
36
114
/// replaced with their unescaped equivalents such as `>`.
@@ -791,6 +869,57 @@ mod xml {
791
869
use super :: * ;
792
870
use pretty_assertions:: assert_eq;
793
871
872
/// Checks attribute value normalization as described in
/// <https://www.w3.org/TR/xml/#AVNormalize>.
#[test]
fn attribute_value_normalization() {
    // empty value
    let attr = Attribute::from(("foo".as_bytes(), "".as_bytes()));
    assert_eq!(attr.normalized_value(), Ok(Cow::Borrowed("")));

    // return, tab, and newline characters (0xD, 0x9, 0xA) must be substituted with a space character
    let raw_value = "\r\nfoo\rbar\tbaz\n\ndelta\n".as_bytes();
    let attr = Attribute::from(("foo".as_bytes(), raw_value));
    assert_eq!(
        attr.normalized_value(),
        Ok(Cow::Borrowed("  foo bar baz  delta "))
    );

    // entities must be terminated
    let raw_value = "abc&quotdef".as_bytes();
    let attr = Attribute::from(("foo".as_bytes(), raw_value));
    assert_eq!(
        attr.normalized_value(),
        Err(EscapeError::UnterminatedEntity(3..11))
    );

    // unknown entities raise error
    // TODO: is this divergence between range behavior of UnterminatedEntity and
    // UnrecognizedSymbol appropriate? It is shared with the unescape code.
    let raw_value = "abc&unkn;def".as_bytes();
    let attr = Attribute::from(("foo".as_bytes(), raw_value));
    assert_eq!(
        attr.normalized_value(),
        Err(EscapeError::UnrecognizedSymbol(4..8, Ok("unkn".to_owned())))
    );

    // custom entities are replaced with the text returned by the resolver
    let raw_value = "a&ent;b".as_bytes();
    let attr = Attribute::from(("foo".as_bytes(), raw_value));
    assert_eq!(
        attr.normalized_value_with(|name| if name == "ent" { Some("value") } else { None }),
        Ok(Cow::Borrowed("avalueb"))
    );

    // TODO: once entity replacement text is processed recursively, add the example from
    // the XML specification: with `d` = "&#xD;", `a` = "&#xA;" and `da` = "&#xD;&#xA;",
    // the value "&d;&d;A&a;&#x20;&a;B&da;" must normalize to "  A   B  ".

    // character literal references are substituted without being replaced by spaces
    let raw_value = "&#xD;&#xD;A&#xA;&#xA;B&#xD;&#xA;".as_bytes();
    let attr = Attribute::from(("foo".as_bytes(), raw_value));
    assert_eq!(
        attr.normalized_value(),
        Ok(Cow::Borrowed("\r\rA\n\nB\r\n"))
    );
}
922
+
794
923
/// Checked attribute is the single attribute
795
924
mod single {
796
925
use super :: * ;
0 commit comments