Skip to content

Commit e1e3692

Browse files
bruceg and BurntSushi
authored and committed
capture: support [, ] and . in capture group names
This slightly expands the set of characters allowed in capture group names to be `[][_0-9A-Za-z.]` from `[_0-9A-Za-z]`. This required some delicacy in order to prevent replacement strings like `$Z[` from referring to invalid capture group names where the intent was to refer to the capture group named `Z`. That is, in order to use `[`, `]` or `.` in a capture group name, one must use the explicit brace syntax: `${Z[}`. We clarify the docs around this issue. Regrettably, we are not much closer to handling #595. In order to support, say, all Unicode word characters, our replacement parser would need to become UTF-8 aware on `&[u8]`. But std makes this difficult and I would prefer not to add another dependency on ad hoc UTF-8 decoding or a dependency on another crate. Closes #649
1 parent 96456dd commit e1e3692

File tree

6 files changed

+111
-31
lines changed

6 files changed

+111
-31
lines changed

regex-syntax/src/ast/parse.rs

+44-4
Original file line numberDiff line numberDiff line change
@@ -98,12 +98,13 @@ fn is_hex(c: char) -> bool {
9898
/// Returns true if the given character is valid in a capture group name.
9999
///
100100
/// If `first` is true, then `c` is treated as the first character in the
101-
/// group name (which is not allowed to be a digit).
101+
/// group name (which must be alphabetic or underscore).
102102
fn is_capture_char(c: char, first: bool) -> bool {
103103
c == '_'
104-
|| (!first && c >= '0' && c <= '9')
105-
|| (c >= 'a' && c <= 'z')
106-
|| (c >= 'A' && c <= 'Z')
104+
|| (!first
105+
&& (('0' <= c && c <= '9') || c == '.' || c == '[' || c == ']'))
106+
|| ('A' <= c && c <= 'Z')
107+
|| ('a' <= c && c <= 'z')
107108
}
108109

109110
/// A builder for a regular expression parser.
@@ -3851,6 +3852,45 @@ bar
38513852
}))
38523853
);
38533854

3855+
assert_eq!(
3856+
parser("(?P<a_1>z)").parse(),
3857+
Ok(Ast::Group(ast::Group {
3858+
span: span(0..10),
3859+
kind: ast::GroupKind::CaptureName(ast::CaptureName {
3860+
span: span(4..7),
3861+
name: s("a_1"),
3862+
index: 1,
3863+
}),
3864+
ast: Box::new(lit('z', 8)),
3865+
}))
3866+
);
3867+
3868+
assert_eq!(
3869+
parser("(?P<a.1>z)").parse(),
3870+
Ok(Ast::Group(ast::Group {
3871+
span: span(0..10),
3872+
kind: ast::GroupKind::CaptureName(ast::CaptureName {
3873+
span: span(4..7),
3874+
name: s("a.1"),
3875+
index: 1,
3876+
}),
3877+
ast: Box::new(lit('z', 8)),
3878+
}))
3879+
);
3880+
3881+
assert_eq!(
3882+
parser("(?P<a[1]>z)").parse(),
3883+
Ok(Ast::Group(ast::Group {
3884+
span: span(0..11),
3885+
kind: ast::GroupKind::CaptureName(ast::CaptureName {
3886+
span: span(4..8),
3887+
name: s("a[1]"),
3888+
index: 1,
3889+
}),
3890+
ast: Box::new(lit('z', 9)),
3891+
}))
3892+
);
3893+
38543894
assert_eq!(
38553895
parser("(?P<").parse().unwrap_err(),
38563896
TestError {

src/expand.rs

+34-16
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ pub fn expand_str(
2424
continue;
2525
}
2626
debug_assert!(!replacement.is_empty());
27-
let cap_ref = match find_cap_ref(replacement) {
27+
let cap_ref = match find_cap_ref(replacement.as_bytes()) {
2828
Some(cap_ref) => cap_ref,
2929
None => {
3030
dst.push_str("$");
@@ -125,19 +125,15 @@ impl From<usize> for Ref<'static> {
125125
/// starting at the beginning of `replacement`.
126126
///
127127
/// If no such valid reference could be found, None is returned.
128-
fn find_cap_ref<T: ?Sized + AsRef<[u8]>>(
129-
replacement: &T,
130-
) -> Option<CaptureRef> {
128+
fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef> {
131129
let mut i = 0;
132130
let rep: &[u8] = replacement.as_ref();
133131
if rep.len() <= 1 || rep[0] != b'$' {
134132
return None;
135133
}
136-
let mut brace = false;
137134
i += 1;
138135
if rep[i] == b'{' {
139-
brace = true;
140-
i += 1;
136+
return find_cap_ref_braced(rep, i + 1);
141137
}
142138
let mut cap_end = i;
143139
while rep.get(cap_end).map_or(false, is_valid_cap_letter) {
@@ -151,12 +147,6 @@ fn find_cap_ref<T: ?Sized + AsRef<[u8]>>(
151147
// check with either unsafe or by parsing the number straight from &[u8].
152148
let cap =
153149
str::from_utf8(&rep[i..cap_end]).expect("valid UTF-8 capture name");
154-
if brace {
155-
if !rep.get(cap_end).map_or(false, |&b| b == b'}') {
156-
return None;
157-
}
158-
cap_end += 1;
159-
}
160150
Some(CaptureRef {
161151
cap: match cap.parse::<u32>() {
162152
Ok(i) => Ref::Number(i as usize),
@@ -166,6 +156,31 @@ fn find_cap_ref<T: ?Sized + AsRef<[u8]>>(
166156
})
167157
}
168158

159+
fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef> {
160+
let start = i;
161+
while rep.get(i).map_or(false, |&b| b != b'}') {
162+
i += 1;
163+
}
164+
if !rep.get(i).map_or(false, |&b| b == b'}') {
165+
return None;
166+
}
167+
// When looking at braced names, we don't put any restrictions on the name,
168+
// so it's possible it could be invalid UTF-8. But a capture group name
169+
// can never be invalid UTF-8, so if we have invalid UTF-8, then we can
170+
// safely return None.
171+
let cap = match str::from_utf8(&rep[start..i]) {
172+
Err(_) => return None,
173+
Ok(cap) => cap,
174+
};
175+
Some(CaptureRef {
176+
cap: match cap.parse::<u32>() {
177+
Ok(i) => Ref::Number(i as usize),
178+
Err(_) => Ref::Named(cap),
179+
},
180+
end: i + 1,
181+
})
182+
}
183+
169184
/// Returns true if and only if the given byte is allowed in a capture name.
170185
fn is_valid_cap_letter(b: &u8) -> bool {
171186
match *b {
@@ -182,13 +197,13 @@ mod tests {
182197
($name:ident, $text:expr) => {
183198
#[test]
184199
fn $name() {
185-
assert_eq!(None, find_cap_ref($text));
200+
assert_eq!(None, find_cap_ref($text.as_bytes()));
186201
}
187202
};
188203
($name:ident, $text:expr, $capref:expr) => {
189204
#[test]
190205
fn $name() {
191-
assert_eq!(Some($capref), find_cap_ref($text));
206+
assert_eq!(Some($capref), find_cap_ref($text.as_bytes()));
192207
}
193208
};
194209
}
@@ -204,7 +219,8 @@ mod tests {
204219
find!(find_cap_ref3, "$0", c!(0, 2));
205220
find!(find_cap_ref4, "$5", c!(5, 2));
206221
find!(find_cap_ref5, "$10", c!(10, 3));
207-
// see https://github.com/rust-lang/regex/pull/585 for more on characters following numbers
222+
// See https://github.com/rust-lang/regex/pull/585
223+
// for more on characters following numbers
208224
find!(find_cap_ref6, "$42a", c!("42a", 4));
209225
find!(find_cap_ref7, "${42}a", c!(42, 5));
210226
find!(find_cap_ref8, "${42");
@@ -217,4 +233,6 @@ mod tests {
217233
find!(find_cap_ref15, "$1_$2", c!("1_", 3));
218234
find!(find_cap_ref16, "$x-$y", c!("x", 2));
219235
find!(find_cap_ref17, "$x_$y", c!("x_", 3));
236+
find!(find_cap_ref18, "${#}", c!("#", 4));
237+
find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
220238
}

src/lib.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,7 @@ $ the end of text (or end-of-line with multi-line mode)
365365
366366
<pre class="rust">
367367
(exp) numbered capture group (indexed by opening parenthesis)
368-
(?P&lt;name&gt;exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z])
368+
(?P&lt;name&gt;exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
369369
(?:exp) non-capturing group
370370
(?flags) set flags within current group
371371
(?flags:exp) set flags for exp (non-capturing)

src/re_bytes.rs

+10-5
Original file line numberDiff line numberDiff line change
@@ -930,17 +930,22 @@ impl<'t> Captures<'t> {
930930
/// Expands all instances of `$name` in `replacement` to the corresponding
931931
/// capture group `name`, and writes them to the `dst` buffer given.
932932
///
933-
/// `name` may be an integer corresponding to the index of the
934-
/// capture group (counted by order of opening parenthesis where `0` is the
933+
/// `name` may be an integer corresponding to the index of the capture
934+
/// group (counted by order of opening parenthesis where `0` is the
935935
/// entire match) or it can be a name (consisting of letters, digits or
936936
/// underscores) corresponding to a named capture group.
937937
///
938938
/// If `name` isn't a valid capture group (whether the name doesn't exist
939939
/// or isn't a valid index), then it is replaced with the empty string.
940940
///
941-
/// The longest possible name is used. e.g., `$1a` looks up the capture
942-
/// group named `1a` and not the capture group at index `1`. To exert more
943-
/// precise control over the name, use braces, e.g., `${1}a`.
941+
/// The longest possible name consisting of the characters `[_0-9A-Za-z]`
942+
/// is used. e.g., `$1a` looks up the capture group named `1a` and not the
943+
/// capture group at index `1`. To exert more precise control over the
944+
/// name, or to refer to a capture group name that uses characters outside
945+
/// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When
946+
/// using braces, any sequence of valid UTF-8 bytes is permitted. If the
947+
/// sequence does not refer to a capture group name in the corresponding
948+
/// regex, then it is replaced with an empty string.
944949
///
945950
/// To write a literal `$` use `$$`.
946951
pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) {

src/re_unicode.rs

+10-5
Original file line numberDiff line numberDiff line change
@@ -947,17 +947,22 @@ impl<'t> Captures<'t> {
947947
/// Expands all instances of `$name` in `replacement` to the corresponding
948948
/// capture group `name`, and writes them to the `dst` buffer given.
949949
///
950-
/// `name` may be an integer corresponding to the index of the
951-
/// capture group (counted by order of opening parenthesis where `0` is the
950+
/// `name` may be an integer corresponding to the index of the capture
951+
/// group (counted by order of opening parenthesis where `0` is the
952952
/// entire match) or it can be a name (consisting of letters, digits or
953953
/// underscores) corresponding to a named capture group.
954954
///
955955
/// If `name` isn't a valid capture group (whether the name doesn't exist
956956
/// or isn't a valid index), then it is replaced with the empty string.
957957
///
958-
/// The longest possible name is used. e.g., `$1a` looks up the capture
959-
/// group named `1a` and not the capture group at index `1`. To exert more
960-
/// precise control over the name, use braces, e.g., `${1}a`.
958+
/// The longest possible name consisting of the characters `[_0-9A-Za-z]`
959+
/// is used. e.g., `$1a` looks up the capture group named `1a` and not the
960+
/// capture group at index `1`. To exert more precise control over the
961+
/// name, or to refer to a capture group name that uses characters outside
962+
/// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When
963+
/// using braces, any sequence of characters is permitted. If the sequence
964+
/// does not refer to a capture group name in the corresponding regex, then
965+
/// it is replaced with an empty string.
961966
///
962967
/// To write a literal `$` use `$$`.
963968
pub fn expand(&self, replacement: &str, dst: &mut String) {

tests/api.rs

+12
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,18 @@ expand!(
195195
);
196196
expand!(expand10, r"(?-u)(?P<a>\w+)\s+(?P<b>\d+)", "abc 123", "$bz$az", "");
197197

198+
expand!(expand_name1, r"%(?P<Z>[a-z]+)", "%abc", "$Z%", "abc%");
199+
expand!(expand_name2, r"\[(?P<Z>[a-z]+)", "[abc", "$Z[", "abc[");
200+
expand!(expand_name3, r"\{(?P<Z>[a-z]+)", "{abc", "$Z{", "abc{");
201+
expand!(expand_name4, r"\}(?P<Z>[a-z]+)", "}abc", "$Z}", "abc}");
202+
expand!(expand_name5, r"%([a-z]+)", "%abc", "$1a%", "%");
203+
expand!(expand_name6, r"%([a-z]+)", "%abc", "${1}a%", "abca%");
204+
expand!(expand_name7, r"\[(?P<Z[>[a-z]+)", "[abc", "${Z[}[", "abc[");
205+
expand!(expand_name8, r"\[(?P<Z[>[a-z]+)", "[abc", "${foo}[", "[");
206+
expand!(expand_name9, r"\[(?P<Z[>[a-z]+)", "[abc", "${1a}[", "[");
207+
expand!(expand_name10, r"\[(?P<Z[>[a-z]+)", "[abc", "${#}[", "[");
208+
expand!(expand_name11, r"\[(?P<Z[>[a-z]+)", "[abc", "${$$}[", "[");
209+
198210
split!(
199211
split1,
200212
r"(?-u)\s+",

0 commit comments

Comments
 (0)