Skip to content

Commit bc4503b

Browse files
committed
percent-encoding: make sets be values of one type, instead of types that implement a trait
1 parent 6c76f2e commit bc4503b

File tree

7 files changed

+143
-202
lines changed

7 files changed

+143
-202
lines changed

Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ bencher = "0.1"
3939
[dependencies]
4040
idna = { version = "0.2.0", path = "./idna" }
4141
matches = "0.1"
42-
percent-encoding = { version = "1.0.0", path = "./percent_encoding" }
42+
percent-encoding = { version = "2.0.0", path = "./percent_encoding" }
4343
serde = {version = "1.0", optional = true}
4444

4545
[[bench]]

percent_encoding/Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "percent-encoding"
3-
version = "1.0.2"
3+
version = "2.0.0"
44
authors = ["The rust-url developers"]
55
description = "Percent encoding and decoding"
66
repository = "https://github.com/servo/rust-url/"

percent_encoding/lib.rs

+76-113
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,9 @@ use std::fmt;
3737
use std::slice;
3838
use std::str;
3939

40-
/// Represents a set of characters / bytes that should be percent-encoded.
40+
/// Represents a set of characters or bytes in the ASCII range.
4141
///
42+
/// This is used to represent which characters should be percent-encoded.
4243
/// See [encode sets specification](http://url.spec.whatwg.org/#simple-encode-set).
4344
///
4445
/// Different characters need to be encoded in different parts of an URL.
@@ -50,116 +51,81 @@ use std::str;
5051
///
5152
/// A few sets are defined in this module.
5253
/// Use the `add` method of an existing set to define different ones.
53-
pub trait EncodeSet: Clone {
54-
/// Called with UTF-8 bytes rather than code points.
55-
/// Should return true for all non-ASCII bytes.
56-
fn contains(&self, byte: u8) -> bool;
54+
pub struct AsciiSet {
55+
mask: [Chunk; 0x80 / BITS_PER_CHUNK],
5756
}
5857

59-
/// Define a new struct
60-
/// that implements the [`EncodeSet`](percent_encoding/trait.EncodeSet.html) trait,
61-
/// for use in [`percent_decode()`](percent_encoding/fn.percent_encode.html)
62-
/// and related functions.
63-
///
64-
/// Parameters are characters to include in the set in addition to those of the base set.
65-
/// See [encode sets specification](http://url.spec.whatwg.org/#simple-encode-set).
66-
///
67-
/// Example
68-
/// =======
69-
///
70-
/// ```rust
71-
/// #[macro_use] extern crate percent_encoding;
72-
/// use percent_encoding::{utf8_percent_encode, SIMPLE_ENCODE_SET};
73-
/// define_encode_set! {
74-
/// /// This encode set is used in the URL parser for query strings.
75-
/// pub QUERY_ENCODE_SET = [SIMPLE_ENCODE_SET] | {' ', '"', '#', '<', '>'}
76-
/// }
77-
/// # fn main() {
78-
/// assert_eq!(utf8_percent_encode("foo bar", QUERY_ENCODE_SET).collect::<String>(), "foo%20bar");
79-
/// # }
80-
/// ```
81-
#[macro_export]
82-
macro_rules! define_encode_set {
83-
($(#[$attr: meta])* pub $name: ident = [$base_set: expr] | {$($ch: pat),*}) => {
84-
$(#[$attr])*
85-
#[derive(Copy, Clone, Debug)]
86-
#[allow(non_camel_case_types)]
87-
pub struct $name;
88-
89-
impl $crate::EncodeSet for $name {
90-
#[inline]
91-
fn contains(&self, byte: u8) -> bool {
92-
match byte as char {
93-
$(
94-
$ch => true,
95-
)*
96-
_ => $base_set.contains(byte)
97-
}
98-
}
99-
}
58+
type Chunk = u32;
59+
60+
const BITS_PER_CHUNK: usize = 8 * std::mem::size_of::<Chunk>();
61+
62+
impl AsciiSet {
63+
/// Called with UTF-8 bytes rather than code points.
64+
/// Not used for non-ASCII bytes.
65+
const fn contains(&self, byte: u8) -> bool {
66+
let chunk = self.mask[byte as usize / BITS_PER_CHUNK];
67+
let mask = 1 << (byte as usize % BITS_PER_CHUNK);
68+
(chunk & mask) != 0
10069
}
101-
}
10270

103-
/// This encode set is used for the path of cannot-be-a-base URLs.
104-
///
105-
/// All ASCII characters less than hexadecimal 20 and greater than 7E are encoded. This includes
106-
/// special characters such as line feed, carriage return, NULL, etc.
107-
#[derive(Copy, Clone, Debug)]
108-
#[allow(non_camel_case_types)]
109-
pub struct SIMPLE_ENCODE_SET;
110-
111-
impl EncodeSet for SIMPLE_ENCODE_SET {
112-
#[inline]
113-
fn contains(&self, byte: u8) -> bool {
114-
byte < 0x20 || byte > 0x7E
71+
fn should_percent_encode(&self, byte: u8) -> bool {
72+
!byte.is_ascii() || self.contains(byte)
11573
}
116-
}
11774

118-
define_encode_set! {
119-
/// This encode set is used in the URL parser for query strings.
120-
///
121-
/// Aside from special characters defined in the [`SIMPLE_ENCODE_SET`](struct.SIMPLE_ENCODE_SET.html),
122-
/// space, double quote ("), hash (#), and inequality qualifiers (<), (>) are encoded.
123-
pub QUERY_ENCODE_SET = [SIMPLE_ENCODE_SET] | {' ', '"', '#', '<', '>'}
75+
pub const fn add(&self, byte: u8) -> Self {
76+
let mut mask = self.mask;
77+
mask[byte as usize / BITS_PER_CHUNK] |= 1 << (byte as usize % BITS_PER_CHUNK);
78+
AsciiSet { mask }
79+
}
12480
}
12581

126-
define_encode_set! {
127-
/// This encode set is used for path components.
128-
///
129-
/// Aside from special characters defined in the [`SIMPLE_ENCODE_SET`](struct.SIMPLE_ENCODE_SET.html),
130-
/// space, double quote ("), hash (#), inequality qualifiers (<), (>), backtick (`),
131-
/// question mark (?), and curly brackets ({), (}) are encoded.
132-
pub DEFAULT_ENCODE_SET = [QUERY_ENCODE_SET] | {'`', '?', '{', '}'}
133-
}
82+
/// https://url.spec.whatwg.org/#c0-control-percent-encode-set
83+
pub const C0_CONTROL: &AsciiSet = &AsciiSet {
84+
mask: [
85+
!0_u32, // C0: 0x00 to 0x1F (32 bits set)
86+
0,
87+
0,
88+
1 << (0x7F_u32 % 32), // DEL: 0x7F (one bit set)
89+
],
90+
};
13491

135-
define_encode_set! {
136-
/// This encode set is used for on '/'-separated path segment
137-
///
138-
/// Aside from special characters defined in the [`SIMPLE_ENCODE_SET`](struct.SIMPLE_ENCODE_SET.html),
139-
/// space, double quote ("), hash (#), inequality qualifiers (<), (>), backtick (`),
140-
/// question mark (?), and curly brackets ({), (}), percent sign (%), forward slash (/) are
141-
/// encoded.
142-
///
143-
/// # Note
144-
///
145-
/// For [special URLs](https://url.spec.whatwg.org/#is-special), the backslash (\) character should
146-
/// additionally be escaped, but that is *not* included in this encode set.
147-
pub PATH_SEGMENT_ENCODE_SET = [DEFAULT_ENCODE_SET] | {'%', '/'}
148-
}
92+
/// https://url.spec.whatwg.org/#fragment-percent-encode-set
93+
pub const FRAGMENT: &AsciiSet = &C0_CONTROL.add(b' ').add(b'"').add(b'<').add(b'>').add(b'`');
14994

150-
define_encode_set! {
151-
/// This encode set is used for username and password.
152-
///
153-
/// Aside from special characters defined in the [`SIMPLE_ENCODE_SET`](struct.SIMPLE_ENCODE_SET.html),
154-
/// space, double quote ("), hash (#), inequality qualifiers (<), (>), backtick (`),
155-
/// question mark (?), and curly brackets ({), (}), forward slash (/), colon (:), semi-colon (;),
156-
/// equality (=), at (@), backslash (\\), square brackets ([), (]), caret (\^), and pipe (|) are
157-
/// encoded.
158-
pub USERINFO_ENCODE_SET = [DEFAULT_ENCODE_SET] | {
159-
'/', ':', ';', '=', '@', '[', '\\', ']', '^', '|'
95+
/// https://url.spec.whatwg.org/#path-percent-encode-set
96+
pub const PATH: &AsciiSet = &FRAGMENT.add(b'#').add(b'?').add(b'{').add(b'}');
97+
98+
/// https://url.spec.whatwg.org/#userinfo-percent-encode-set
99+
pub const USERINFO: &AsciiSet = &PATH
100+
.add(b'/')
101+
.add(b':')
102+
.add(b';')
103+
.add(b'=')
104+
.add(b'@')
105+
.add(b'[')
106+
.add(b'\\')
107+
.add(b']')
108+
.add(b'^')
109+
.add(b'|');
110+
111+
macro_rules! static_assert {
112+
($( $bool: expr, )+) => {
113+
fn _static_assert() {
114+
$(
115+
let _ = std::mem::transmute::<[u8; $bool as usize], u8>;
116+
)+
117+
}
160118
}
161119
}
162120

121+
static_assert! {
122+
C0_CONTROL.contains(0x00),
123+
C0_CONTROL.contains(0x1F),
124+
!C0_CONTROL.contains(0x20),
125+
!C0_CONTROL.contains(0x7E),
126+
C0_CONTROL.contains(0x7F),
127+
}
128+
163129
/// Return the percent-encoding of the given bytes.
164130
///
165131
/// This is unconditional, unlike `percent_encode()` which uses an encode set.
@@ -214,10 +180,10 @@ pub fn percent_encode_byte(byte: u8) -> &'static str {
214180
/// assert_eq!(percent_encode(b"foo bar?", DEFAULT_ENCODE_SET).to_string(), "foo%20bar%3F");
215181
/// ```
216182
#[inline]
217-
pub fn percent_encode<E: EncodeSet>(input: &[u8], encode_set: E) -> PercentEncode<E> {
183+
pub fn percent_encode<'a>(input: &'a [u8], encode_set: &'static AsciiSet) -> PercentEncode<'a> {
218184
PercentEncode {
219185
bytes: input,
220-
encode_set: encode_set,
186+
encode_set,
221187
}
222188
}
223189

@@ -233,35 +199,32 @@ pub fn percent_encode<E: EncodeSet>(input: &[u8], encode_set: E) -> PercentEncod
233199
/// assert_eq!(utf8_percent_encode("foo bar?", DEFAULT_ENCODE_SET).to_string(), "foo%20bar%3F");
234200
/// ```
235201
#[inline]
236-
pub fn utf8_percent_encode<E: EncodeSet>(input: &str, encode_set: E) -> PercentEncode<E> {
202+
pub fn utf8_percent_encode<'a>(input: &'a str, encode_set: &'static AsciiSet) -> PercentEncode<'a> {
237203
percent_encode(input.as_bytes(), encode_set)
238204
}
239205

240206
/// The return type of `percent_encode()` and `utf8_percent_encode()`.
241-
#[derive(Clone, Debug)]
242-
pub struct PercentEncode<'a, E: EncodeSet> {
207+
#[derive(Clone)]
208+
pub struct PercentEncode<'a> {
243209
bytes: &'a [u8],
244-
encode_set: E,
210+
encode_set: &'static AsciiSet,
245211
}
246212

247-
impl<'a, E: EncodeSet> Iterator for PercentEncode<'a, E> {
213+
impl<'a> Iterator for PercentEncode<'a> {
248214
type Item = &'a str;
249215

250216
fn next(&mut self) -> Option<&'a str> {
251217
if let Some((&first_byte, remaining)) = self.bytes.split_first() {
252-
if self.encode_set.contains(first_byte) {
218+
if self.encode_set.should_percent_encode(first_byte) {
253219
self.bytes = remaining;
254220
Some(percent_encode_byte(first_byte))
255221
} else {
256-
assert!(first_byte.is_ascii());
257222
for (i, &byte) in remaining.iter().enumerate() {
258-
if self.encode_set.contains(byte) {
223+
if self.encode_set.should_percent_encode(byte) {
259224
// 1 for first_byte + i for previous iterations of this loop
260225
let (unchanged_slice, remaining) = self.bytes.split_at(1 + i);
261226
self.bytes = remaining;
262227
return Some(unsafe { str::from_utf8_unchecked(unchanged_slice) });
263-
} else {
264-
assert!(byte.is_ascii());
265228
}
266229
}
267230
let unchanged_slice = self.bytes;
@@ -282,7 +245,7 @@ impl<'a, E: EncodeSet> Iterator for PercentEncode<'a, E> {
282245
}
283246
}
284247

285-
impl<'a, E: EncodeSet> fmt::Display for PercentEncode<'a, E> {
248+
impl<'a> fmt::Display for PercentEncode<'a> {
286249
fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
287250
for c in (*self).clone() {
288251
formatter.write_str(c)?
@@ -291,8 +254,8 @@ impl<'a, E: EncodeSet> fmt::Display for PercentEncode<'a, E> {
291254
}
292255
}
293256

294-
impl<'a, E: EncodeSet> From<PercentEncode<'a, E>> for Cow<'a, str> {
295-
fn from(mut iter: PercentEncode<'a, E>) -> Self {
257+
impl<'a> From<PercentEncode<'a>> for Cow<'a, str> {
258+
fn from(mut iter: PercentEncode<'a>) -> Self {
296259
match iter.next() {
297260
None => "".into(),
298261
Some(first) => match iter.next() {

src/host.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
use idna;
1010
use parser::{ParseError, ParseResult};
11-
use percent_encoding::{percent_decode, utf8_percent_encode, SIMPLE_ENCODE_SET};
11+
use percent_encoding::{percent_decode, utf8_percent_encode, C0_CONTROL};
1212
use std::cmp;
1313
use std::fmt::{self, Formatter};
1414
use std::net::{Ipv4Addr, Ipv6Addr};
@@ -207,7 +207,7 @@ impl Host<String> {
207207
{
208208
return Err(ParseError::InvalidDomainCharacter);
209209
}
210-
let s = utf8_percent_encode(input, SIMPLE_ENCODE_SET).to_string();
210+
let s = utf8_percent_encode(input, C0_CONTROL).to_string();
211211
Ok(Host::Domain(s))
212212
}
213213
}

src/lib.rs

+11-15
Original file line numberDiff line numberDiff line change
@@ -110,17 +110,13 @@ assert_eq!(css_url.as_str(), "http://servo.github.io/rust-url/main.css");
110110
#[macro_use]
111111
extern crate matches;
112112
extern crate idna;
113+
extern crate percent_encoding;
113114
#[cfg(feature = "serde")]
114115
extern crate serde;
115-
#[macro_use]
116-
extern crate percent_encoding;
117116

118117
use host::HostInternal;
119118
use parser::{to_u32, Context, Parser, SchemeType};
120-
use percent_encoding::{
121-
percent_decode, percent_encode, utf8_percent_encode, PATH_SEGMENT_ENCODE_SET,
122-
USERINFO_ENCODE_SET,
123-
};
119+
use percent_encoding::{percent_decode, percent_encode, utf8_percent_encode, USERINFO};
124120
use std::borrow::Borrow;
125121
use std::cmp;
126122
#[cfg(feature = "serde")]
@@ -1229,8 +1225,9 @@ impl Url {
12291225
if let Some(input) = query {
12301226
self.query_start = Some(to_u32(self.serialization.len()).unwrap());
12311227
self.serialization.push('?');
1228+
let scheme_type = SchemeType::from(self.scheme());
12321229
let scheme_end = self.scheme_end;
1233-
self.mutate(|parser| parser.parse_query(scheme_end, parser::Input::new(input)));
1230+
self.mutate(|parser| parser.parse_query(scheme_type, scheme_end, parser::Input::new(input)));
12341231
}
12351232

12361233
self.restore_already_parsed_fragment(fragment);
@@ -1729,7 +1726,7 @@ impl Url {
17291726
self.serialization.truncate(self.username_end as usize);
17301727
self.serialization.push(':');
17311728
self.serialization
1732-
.extend(utf8_percent_encode(password, USERINFO_ENCODE_SET));
1729+
.extend(utf8_percent_encode(password, USERINFO));
17331730
self.serialization.push('@');
17341731

17351732
let old_host_start = self.host_start;
@@ -1824,7 +1821,7 @@ impl Url {
18241821
let after_username = self.slice(self.username_end..).to_owned();
18251822
self.serialization.truncate(username_start as usize);
18261823
self.serialization
1827-
.extend(utf8_percent_encode(username, USERINFO_ENCODE_SET));
1824+
.extend(utf8_percent_encode(username, USERINFO));
18281825

18291826
let mut removed_bytes = self.username_end;
18301827
self.username_end = to_u32(self.serialization.len()).unwrap();
@@ -2290,6 +2287,8 @@ impl<'de> serde::Deserialize<'de> for Url {
22902287
}
22912288
}
22922289

2290+
const PATH_SEGMENT: &percent_encoding::AsciiSet = &percent_encoding::PATH.add(b'/');
2291+
22932292
#[cfg(any(unix, target_os = "redox"))]
22942293
fn path_to_file_url_segments(
22952294
path: &Path,
@@ -2307,7 +2306,7 @@ fn path_to_file_url_segments(
23072306
serialization.push('/');
23082307
serialization.extend(percent_encode(
23092308
component.as_os_str().as_bytes(),
2310-
PATH_SEGMENT_ENCODE_SET,
2309+
PATH_SEGMENT,
23112310
));
23122311
}
23132312
if empty {
@@ -2355,7 +2354,7 @@ fn path_to_file_url_segments_windows(
23552354
host_internal = host.into();
23562355
serialization.push('/');
23572356
let share = share.to_str().ok_or(())?;
2358-
serialization.extend(percent_encode(share.as_bytes(), PATH_SEGMENT_ENCODE_SET));
2357+
serialization.extend(percent_encode(share.as_bytes(), PATH_SEGMENT));
23592358
}
23602359
_ => return Err(()),
23612360
},
@@ -2370,10 +2369,7 @@ fn path_to_file_url_segments_windows(
23702369
// FIXME: somehow work with non-unicode?
23712370
let component = component.as_os_str().to_str().ok_or(())?;
23722371
serialization.push('/');
2373-
serialization.extend(percent_encode(
2374-
component.as_bytes(),
2375-
PATH_SEGMENT_ENCODE_SET,
2376-
));
2372+
serialization.extend(percent_encode(component.as_bytes(), PATH_SEGMENT));
23772373
}
23782374
Ok((host_end, host_internal))
23792375
}

0 commit comments

Comments
 (0)