@@ -37,8 +37,9 @@ use std::fmt;
37
37
use std:: slice;
38
38
use std:: str;
39
39
40
- /// Represents a set of characters / bytes that should be percent-encoded .
40
+ /// Represents a set of characters or bytes in the ASCII range .
41
41
///
42
+ /// This is used to represent which characters should be percent-encoded.
42
43
/// See [encode sets specification](http://url.spec.whatwg.org/#simple-encode-set).
43
44
///
44
45
/// Different characters need to be encoded in different parts of an URL.
@@ -50,116 +51,81 @@ use std::str;
50
51
///
51
52
/// A few sets are defined in this module.
52
53
/// Use the `add` method of `AsciiSet` to define different ones.
53
- pub trait EncodeSet : Clone {
54
- /// Called with UTF-8 bytes rather than code points.
55
- /// Should return true for all non-ASCII bytes.
56
- fn contains ( & self , byte : u8 ) -> bool ;
54
+ pub struct AsciiSet {
55
+ mask : [ Chunk ; 0x80 / BITS_PER_CHUNK ] , // bitmask with one bit per byte in 0x00..0x80, packed into `Chunk` words; a set bit means the byte is in the set
57
56
}
58
57
59
- /// Define a new struct
60
- /// that implements the [`EncodeSet`](percent_encoding/trait.EncodeSet.html) trait,
61
- /// for use in [`percent_decode()`](percent_encoding/fn.percent_encode.html)
62
- /// and related functions.
63
- ///
64
- /// Parameters are characters to include in the set in addition to those of the base set.
65
- /// See [encode sets specification](http://url.spec.whatwg.org/#simple-encode-set).
66
- ///
67
- /// Example
68
- /// =======
69
- ///
70
- /// ```rust
71
- /// #[macro_use] extern crate percent_encoding;
72
- /// use percent_encoding::{utf8_percent_encode, SIMPLE_ENCODE_SET};
73
- /// define_encode_set! {
74
- /// /// This encode set is used in the URL parser for query strings.
75
- /// pub QUERY_ENCODE_SET = [SIMPLE_ENCODE_SET] | {' ', '"', '#', '<', '>'}
76
- /// }
77
- /// # fn main() {
78
- /// assert_eq!(utf8_percent_encode("foo bar", QUERY_ENCODE_SET).collect::<String>(), "foo%20bar");
79
- /// # }
80
- /// ```
81
- #[ macro_export]
82
- macro_rules! define_encode_set {
83
- ( $( #[ $attr: meta] ) * pub $name: ident = [ $base_set: expr] | { $( $ch: pat) ,* } ) => {
84
- $( #[ $attr] ) *
85
- #[ derive( Copy , Clone , Debug ) ]
86
- #[ allow( non_camel_case_types) ]
87
- pub struct $name;
88
-
89
- impl $crate:: EncodeSet for $name {
90
- #[ inline]
91
- fn contains( & self , byte: u8 ) -> bool {
92
- match byte as char {
93
- $(
94
- $ch => true ,
95
- ) *
96
- _ => $base_set. contains( byte)
97
- }
98
- }
99
- }
58
+ type Chunk = u32 ; // word type used for each element of `AsciiSet::mask`
59
+
60
+ const BITS_PER_CHUNK : usize = 8 * std:: mem:: size_of :: < Chunk > ( ) ; // number of bits in one `Chunk`: 8 bits per byte times its size in bytes
61
+
62
+ impl AsciiSet {
63
+ /// Called with UTF-8 bytes rather than code points. Returns whether the bit for `byte` is set in the mask.
64
+ /// Not used for non-ASCII bytes: for `byte >= 0x80` the chunk index would be past the end of `mask`.
65
+ const fn contains ( & self , byte : u8 ) -> bool {
66
+ let chunk = self . mask [ byte as usize / BITS_PER_CHUNK ] ; // the chunk that holds this byte's bit
67
+ let mask = 1 << ( byte as usize % BITS_PER_CHUNK ) ; // single-bit mask for the byte's position inside that chunk
68
+ ( chunk & mask) != 0
100
69
}
101
- }
102
70
103
- /// This encode set is used for the path of cannot-be-a-base URLs.
104
- ///
105
- /// All ASCII charcters less than hexidecimal 20 and greater than 7E are encoded. This includes
106
- /// special charcters such as line feed, carriage return, NULL, etc.
107
- #[ derive( Copy , Clone , Debug ) ]
108
- #[ allow( non_camel_case_types) ]
109
- pub struct SIMPLE_ENCODE_SET ;
110
-
111
- impl EncodeSet for SIMPLE_ENCODE_SET {
112
- #[ inline]
113
- fn contains ( & self , byte : u8 ) -> bool {
114
- byte < 0x20 || byte > 0x7E
71
/// Returns true for every non-ASCII byte, and for ASCII bytes that are in this set.
+ fn should_percent_encode ( & self , byte : u8 ) -> bool {
72
+ !byte. is_ascii ( ) || self . contains ( byte) // `||` short-circuits, so `contains` is only reached for ASCII bytes
115
73
}
116
- }
117
74
118
- define_encode_set ! {
119
- /// This encode set is used in the URL parser for query strings.
120
- ///
121
- /// Aside from special chacters defined in the [`SIMPLE_ENCODE_SET`](struct.SIMPLE_ENCODE_SET.html),
122
- /// space, double quote ("), hash (#), and inequality qualifiers (<), (>) are encoded.
123
- pub QUERY_ENCODE_SET = [ SIMPLE_ENCODE_SET ] | { ' ' , '"' , '#' , '<' , '>' }
75
/// Returns a new set equal to this one with the bit for `byte` also set; `self` is left unchanged.
/// `byte` must be ASCII: for `byte >= 0x80` the chunk index would be past the end of the mask.
+ pub const fn add ( & self , byte : u8 ) -> Self {
76
+ let mut mask = self . mask ; // work on a copy of the mask array
77
+ mask[ byte as usize / BITS_PER_CHUNK ] |= 1 << ( byte as usize % BITS_PER_CHUNK ) ;
78
+ AsciiSet { mask }
79
+ }
124
80
}
125
81
126
- define_encode_set ! {
127
- /// This encode set is used for path components.
128
- ///
129
- /// Aside from special chacters defined in the [`SIMPLE_ENCODE_SET`](struct.SIMPLE_ENCODE_SET.html),
130
- /// space, double quote ("), hash (#), inequality qualifiers (<), (>), backtick (`),
131
- /// question mark (?), and curly brackets ({), (}) are encoded.
132
- pub DEFAULT_ENCODE_SET = [ QUERY_ENCODE_SET ] | { '`' , '?' , '{' , '}' }
133
- }
82
+ /// C0 control percent-encode set (bytes 0x00 to 0x1F, plus 0x7F): <https://url.spec.whatwg.org/#c0-control-percent-encode-set>
83
+ pub const C0_CONTROL : & AsciiSet = & AsciiSet {
84
+ mask : [
85
+ !0_u32 , // C0: 0x00 to 0x1F (32 bits set)
86
+ 0 , // 0x20 to 0x3F: no bits set
87
+ 0 , // 0x40 to 0x5F: no bits set
88
+ 1 << ( 0x7F_u32 % 32 ) , // DEL: 0x7F (one bit set)
89
+ ] ,
90
+ } ;
134
91
135
- define_encode_set ! {
136
- /// This encode set is used for on '/'-separated path segment
137
- ///
138
- /// Aside from special chacters defined in the [`SIMPLE_ENCODE_SET`](struct.SIMPLE_ENCODE_SET.html),
139
- /// space, double quote ("), hash (#), inequality qualifiers (<), (>), backtick (`),
140
- /// question mark (?), and curly brackets ({), (}), percent sign (%), forward slash (/) are
141
- /// encoded.
142
- ///
143
- /// # Note
144
- ///
145
- /// For [special URLs](https://url.spec.whatwg.org/#is-special), the backslash (\) character should
146
- /// additionally be escaped, but that is *not* included in this encode set.
147
- pub PATH_SEGMENT_ENCODE_SET = [ DEFAULT_ENCODE_SET ] | { '%' , '/' }
148
- }
92
+ /// Fragment percent-encode set: `C0_CONTROL` plus space, `"`, `<`, `>` and backtick. <https://url.spec.whatwg.org/#fragment-percent-encode-set>
93
+ pub const FRAGMENT : & AsciiSet = & C0_CONTROL . add ( b' ' ) . add ( b'"' ) . add ( b'<' ) . add ( b'>' ) . add ( b'`' ) ;
149
94
150
- define_encode_set ! {
151
- /// This encode set is used for username and password.
152
- ///
153
- /// Aside from special chacters defined in the [`SIMPLE_ENCODE_SET`](struct.SIMPLE_ENCODE_SET.html),
154
- /// space, double quote ("), hash (#), inequality qualifiers (<), (>), backtick (`),
155
- /// question mark (?), and curly brackets ({), (}), forward slash (/), colon (:), semi-colon (;),
156
- /// equality (=), at (@), backslash (\\), square brackets ([), (]), caret (\^), and pipe (|) are
157
- /// encoded.
158
- pub USERINFO_ENCODE_SET = [ DEFAULT_ENCODE_SET ] | {
159
- '/' , ':' , ';' , '=' , '@' , '[' , '\\' , ']' , '^' , '|'
95
+ /// Path percent-encode set: `FRAGMENT` plus `#`, `?`, `{` and `}`. <https://url.spec.whatwg.org/#path-percent-encode-set>
96
+ pub const PATH : & AsciiSet = & FRAGMENT . add ( b'#' ) . add ( b'?' ) . add ( b'{' ) . add ( b'}' ) ;
97
+
98
+ /// Userinfo percent-encode set: `PATH` plus `/`, `:`, `;`, `=`, `@`, `[`, `\`, `]`, `^` and `|`. <https://url.spec.whatwg.org/#userinfo-percent-encode-set>
99
+ pub const USERINFO : & AsciiSet = & PATH
100
+ . add ( b'/' )
101
+ . add ( b':' )
102
+ . add ( b';' )
103
+ . add ( b'=' )
104
+ . add ( b'@' )
105
+ . add ( b'[' )
106
+ . add ( b'\\' )
107
+ . add ( b']' )
108
+ . add ( b'^' )
109
+ . add ( b'|' ) ;
110
+
111
// Compile-time assertion helper. Each `$bool` is used as the length of a `[u8; N]` array type
// (`true as usize` is 1, `false as usize` is 0), and that array type is named as the source
// type of a `transmute` to `u8`. `transmute` requires both types to have the same size, so a
// false condition is meant to be rejected at compile time.
// NOTE(review): the transmute is only named, never called; confirm the size check still fires
// on every toolchain this crate supports.
+ macro_rules! static_assert {
112
+ ( $( $bool: expr, ) +) => {
113
+ fn _static_assert( ) {
114
+ $(
115
+ let _ = std:: mem:: transmute:: <[ u8 ; $bool as usize ] , u8 >;
116
+ ) +
117
+ }
160
118
}
161
119
}
162
120
121
// Compile-time checks of the hand-written `C0_CONTROL` mask at the edges of its ranges.
+ static_assert ! {
122
+ C0_CONTROL . contains( 0x00 ) , // first C0 control
123
+ C0_CONTROL . contains( 0x1F ) , // last C0 control
124
+ !C0_CONTROL . contains( 0x20 ) , // space: first byte after the C0 range
125
+ !C0_CONTROL . contains( 0x7E ) , // `~`: last byte before DEL
126
+ C0_CONTROL . contains( 0x7F ) , // DEL
127
+ }
128
+
163
129
/// Return the percent-encoding of the given bytes.
164
130
///
165
131
/// This is unconditional, unlike `percent_encode()` which uses an encode set.
@@ -214,10 +180,10 @@ pub fn percent_encode_byte(byte: u8) -> &'static str {
214
180
/// assert_eq!(percent_encode(b"foo bar?", DEFAULT_ENCODE_SET).to_string(), "foo%20bar%3F");
215
181
/// ```
216
182
#[ inline]
217
- pub fn percent_encode < E : EncodeSet > ( input : & [ u8 ] , encode_set : E ) -> PercentEncode < E > {
183
+ pub fn percent_encode < ' a > ( input : & ' a [ u8 ] , encode_set : & ' static AsciiSet ) -> PercentEncode < ' a > {
218
184
PercentEncode {
219
185
bytes : input, // only stores the borrowed input; encoding happens as the returned iterator is advanced
220
- encode_set : encode_set ,
186
+ encode_set,
221
187
}
222
188
}
223
189
@@ -233,35 +199,32 @@ pub fn percent_encode<E: EncodeSet>(input: &[u8], encode_set: E) -> PercentEncod
233
199
/// assert_eq!(utf8_percent_encode("foo bar?", DEFAULT_ENCODE_SET).to_string(), "foo%20bar%3F");
234
200
/// ```
235
201
#[ inline]
236
- pub fn utf8_percent_encode < E : EncodeSet > ( input : & str , encode_set : E ) -> PercentEncode < E > {
202
+ pub fn utf8_percent_encode < ' a > ( input : & ' a str , encode_set : & ' static AsciiSet ) -> PercentEncode < ' a > {
237
203
percent_encode ( input. as_bytes ( ) , encode_set) // delegates to the byte-slice version, passing the string's UTF-8 bytes
238
204
}
239
205
240
206
/// The return type of `percent_encode()` and `utf8_percent_encode()`: an iterator over `&str` pieces of the encoded output.
241
- #[ derive( Clone , Debug ) ]
242
- pub struct PercentEncode < ' a , E : EncodeSet > {
207
// NOTE(review): the new derive drops `Debug`, which the previous definition provided — confirm this is intentional.
+ #[ derive( Clone ) ]
208
+ pub struct PercentEncode < ' a > {
243
209
bytes : & ' a [ u8 ] , // input bytes not yet yielded by the iterator
244
- encode_set : E ,
210
+ encode_set : & ' static AsciiSet , // ASCII bytes to percent-encode; non-ASCII bytes are encoded regardless of the set
245
211
}
246
212
247
- impl < ' a , E : EncodeSet > Iterator for PercentEncode < ' a , E > {
213
+ impl < ' a > Iterator for PercentEncode < ' a > {
248
214
type Item = & ' a str ;
249
215
250
216
fn next ( & mut self ) -> Option < & ' a str > {
251
217
if let Some ( ( & first_byte, remaining) ) = self . bytes . split_first ( ) {
252
- if self . encode_set . contains ( first_byte) {
218
+ if self . encode_set . should_percent_encode ( first_byte) {
253
219
self . bytes = remaining;
254
220
Some ( percent_encode_byte ( first_byte) )
255
221
} else {
256
- assert ! ( first_byte. is_ascii( ) ) ;
257
222
for ( i, & byte) in remaining. iter ( ) . enumerate ( ) {
258
- if self . encode_set . contains ( byte) {
223
+ if self . encode_set . should_percent_encode ( byte) {
259
224
// 1 for first_byte + i for previous iterations of this loop
260
225
let ( unchanged_slice, remaining) = self . bytes . split_at ( 1 + i) ;
261
226
self . bytes = remaining;
262
227
return Some ( unsafe { str:: from_utf8_unchecked ( unchanged_slice) } ) ;
263
- } else {
264
- assert ! ( byte. is_ascii( ) ) ;
265
228
}
266
229
}
267
230
let unchanged_slice = self . bytes ;
@@ -282,7 +245,7 @@ impl<'a, E: EncodeSet> Iterator for PercentEncode<'a, E> {
282
245
}
283
246
}
284
247
285
- impl < ' a , E : EncodeSet > fmt:: Display for PercentEncode < ' a , E > {
248
+ impl < ' a > fmt:: Display for PercentEncode < ' a > {
286
249
fn fmt ( & self , formatter : & mut fmt:: Formatter ) -> fmt:: Result {
287
250
for c in ( * self ) . clone ( ) {
288
251
formatter. write_str ( c) ?
@@ -291,8 +254,8 @@ impl<'a, E: EncodeSet> fmt::Display for PercentEncode<'a, E> {
291
254
}
292
255
}
293
256
294
- impl < ' a , E : EncodeSet > From < PercentEncode < ' a , E > > for Cow < ' a , str > {
295
- fn from ( mut iter : PercentEncode < ' a , E > ) -> Self {
257
+ impl < ' a > From < PercentEncode < ' a > > for Cow < ' a , str > {
258
+ fn from ( mut iter : PercentEncode < ' a > ) -> Self {
296
259
match iter. next ( ) {
297
260
None => "" . into ( ) ,
298
261
Some ( first) => match iter. next ( ) {
0 commit comments