capture: support [, ] and . in capture group names

bruceg · BurntSushi · commit e1e36925ca75 · 2020-10-11T20:08:30.000-04:00
This slightly expands the set of characters allowed in capture group names to be `[][_0-9A-Za-z.]` from `[_0-9A-Za-z]`. This required some delicacy in order to avoid replacement strings like `$Z[` from referring to invalid capture group names where the intent was to refer to the capture group named `Z`. That is, in order to use `[`, `]` or `.` in a capture group name, one must use the explicit brace syntax: `${Z[}`. We clarify the docs around this issue. Regretably, we are not much closer to handling #595. In order to support, say, all Unicode word characters, our replacement parser would need to become UTF-8 aware on `&[u8]`. But std makes this difficult and I would prefer not to add another dependency on ad hoc UTF-8 decoding or a dependency on another crate. Closes #649
diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs
@@ -98,12 +98,13 @@ fn is_hex(c: char) -> bool {
 /// Returns true if the given character is a valid in a capture group name.
 ///
 /// If `first` is true, then `c` is treated as the first character in the
-/// group name (which is not allowed to be a digit).
+/// group name (which must be alphabetic or underscore).
 fn is_capture_char(c: char, first: bool) -> bool {
     c == '_'
-        || (!first && c >= '0' && c <= '9')
-        || (c >= 'a' && c <= 'z')
-        || (c >= 'A' && c <= 'Z')
+        || (!first
+            && (('0' <= c && c <= '9') || c == '.' || c == '[' || c == ']'))
+        || ('A' <= c && c <= 'Z')
+        || ('a' <= c && c <= 'z')
 }
 
 /// A builder for a regular expression parser.
@@ -3851,6 +3852,45 @@ bar
             }))
         );
 
+        assert_eq!(
+            parser("(?P<a_1>z)").parse(),
+            Ok(Ast::Group(ast::Group {
+                span: span(0..10),
+                kind: ast::GroupKind::CaptureName(ast::CaptureName {
+                    span: span(4..7),
+                    name: s("a_1"),
+                    index: 1,
+                }),
+                ast: Box::new(lit('z', 8)),
+            }))
+        );
+
+        assert_eq!(
+            parser("(?P<a.1>z)").parse(),
+            Ok(Ast::Group(ast::Group {
+                span: span(0..10),
+                kind: ast::GroupKind::CaptureName(ast::CaptureName {
+                    span: span(4..7),
+                    name: s("a.1"),
+                    index: 1,
+                }),
+                ast: Box::new(lit('z', 8)),
+            }))
+        );
+
+        assert_eq!(
+            parser("(?P<a[1]>z)").parse(),
+            Ok(Ast::Group(ast::Group {
+                span: span(0..11),
+                kind: ast::GroupKind::CaptureName(ast::CaptureName {
+                    span: span(4..8),
+                    name: s("a[1]"),
+                    index: 1,
+                }),
+                ast: Box::new(lit('z', 9)),
+            }))
+        );
+
         assert_eq!(
             parser("(?P<").parse().unwrap_err(),
             TestError {
diff --git a/src/expand.rs b/src/expand.rs
@@ -24,7 +24,7 @@ pub fn expand_str(
             continue;
         }
         debug_assert!(!replacement.is_empty());
-        let cap_ref = match find_cap_ref(replacement) {
+        let cap_ref = match find_cap_ref(replacement.as_bytes()) {
             Some(cap_ref) => cap_ref,
             None => {
                 dst.push_str("$");
@@ -125,19 +125,15 @@ impl From<usize> for Ref<'static> {
 /// starting at the beginning of `replacement`.
 ///
 /// If no such valid reference could be found, None is returned.
-fn find_cap_ref<T: ?Sized + AsRef<[u8]>>(
-    replacement: &T,
-) -> Option<CaptureRef> {
+fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef> {
     let mut i = 0;
     let rep: &[u8] = replacement.as_ref();
     if rep.len() <= 1 || rep[0] != b'$' {
         return None;
     }
-    let mut brace = false;
     i += 1;
     if rep[i] == b'{' {
-        brace = true;
-        i += 1;
+        return find_cap_ref_braced(rep, i + 1);
     }
     let mut cap_end = i;
     while rep.get(cap_end).map_or(false, is_valid_cap_letter) {
@@ -151,12 +147,6 @@ fn find_cap_ref<T: ?Sized + AsRef<[u8]>>(
     // check with either unsafe or by parsing the number straight from &[u8].
     let cap =
         str::from_utf8(&rep[i..cap_end]).expect("valid UTF-8 capture name");
-    if brace {
-        if !rep.get(cap_end).map_or(false, |&b| b == b'}') {
-            return None;
-        }
-        cap_end += 1;
-    }
     Some(CaptureRef {
         cap: match cap.parse::<u32>() {
             Ok(i) => Ref::Number(i as usize),
@@ -166,6 +156,31 @@ fn find_cap_ref<T: ?Sized + AsRef<[u8]>>(
     })
 }
 
+fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef> {
+    let start = i;
+    while rep.get(i).map_or(false, |&b| b != b'}') {
+        i += 1;
+    }
+    if !rep.get(i).map_or(false, |&b| b == b'}') {
+        return None;
+    }
+    // When looking at braced names, we don't put any restrictions on the name,
+    // so it's possible it could be invalid UTF-8. But a capture group name
+    // can never be invalid UTF-8, so if we have invalid UTF-8, then we can
+    // safely return None.
+    let cap = match str::from_utf8(&rep[start..i]) {
+        Err(_) => return None,
+        Ok(cap) => cap,
+    };
+    Some(CaptureRef {
+        cap: match cap.parse::<u32>() {
+            Ok(i) => Ref::Number(i as usize),
+            Err(_) => Ref::Named(cap),
+        },
+        end: i + 1,
+    })
+}
+
 /// Returns true if and only if the given byte is allowed in a capture name.
 fn is_valid_cap_letter(b: &u8) -> bool {
     match *b {
@@ -182,13 +197,13 @@ mod tests {
         ($name:ident, $text:expr) => {
             #[test]
             fn $name() {
-                assert_eq!(None, find_cap_ref($text));
+                assert_eq!(None, find_cap_ref($text.as_bytes()));
             }
         };
         ($name:ident, $text:expr, $capref:expr) => {
             #[test]
             fn $name() {
-                assert_eq!(Some($capref), find_cap_ref($text));
+                assert_eq!(Some($capref), find_cap_ref($text.as_bytes()));
             }
         };
     }
@@ -204,7 +219,8 @@ mod tests {
     find!(find_cap_ref3, "$0", c!(0, 2));
     find!(find_cap_ref4, "$5", c!(5, 2));
     find!(find_cap_ref5, "$10", c!(10, 3));
-    // see https://github.com/rust-lang/regex/pull/585 for more on characters following numbers
+    // See https://github.com/rust-lang/regex/pull/585
+    // for more on characters following numbers
     find!(find_cap_ref6, "$42a", c!("42a", 4));
     find!(find_cap_ref7, "${42}a", c!(42, 5));
     find!(find_cap_ref8, "${42");
@@ -217,4 +233,6 @@ mod tests {
     find!(find_cap_ref15, "$1_$2", c!("1_", 3));
     find!(find_cap_ref16, "$x-$y", c!("x", 2));
     find!(find_cap_ref17, "$x_$y", c!("x_", 3));
+    find!(find_cap_ref18, "${#}", c!("#", 4));
+    find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
 }
diff --git a/src/lib.rs b/src/lib.rs
@@ -365,7 +365,7 @@ $     the end of text (or end-of-line with multi-line mode)
 
 <pre class="rust">
 (exp)          numbered capture group (indexed by opening parenthesis)
-(?P&lt;name&gt;exp)  named (also numbered) capture group (allowed chars: [_0-9a-zA-Z])
+(?P&lt;name&gt;exp)  named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
 (?:exp)        non-capturing group
 (?flags)       set flags within current group
 (?flags:exp)   set flags for exp (non-capturing)
diff --git a/src/re_bytes.rs b/src/re_bytes.rs
@@ -930,17 +930,22 @@ impl<'t> Captures<'t> {
     /// Expands all instances of `$name` in `replacement` to the corresponding
     /// capture group `name`, and writes them to the `dst` buffer given.
     ///
-    /// `name` may be an integer corresponding to the index of the
-    /// capture group (counted by order of opening parenthesis where `0` is the
+    /// `name` may be an integer corresponding to the index of the capture
+    /// group (counted by order of opening parenthesis where `0` is the
     /// entire match) or it can be a name (consisting of letters, digits or
     /// underscores) corresponding to a named capture group.
     ///
     /// If `name` isn't a valid capture group (whether the name doesn't exist
     /// or isn't a valid index), then it is replaced with the empty string.
     ///
-    /// The longest possible name is used. e.g., `$1a` looks up the capture
-    /// group named `1a` and not the capture group at index `1`. To exert more
-    /// precise control over the name, use braces, e.g., `${1}a`.
+    /// The longest possible name consisting of the characters `[_0-9A-Za-z]`
+    /// is used. e.g., `$1a` looks up the capture group named `1a` and not the
+    /// capture group at index `1`. To exert more precise control over the
+    /// name, or to refer to a capture group name that uses characters outside
+    /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When
+    /// using braces, any sequence of valid UTF-8 bytes is permitted. If the
+    /// sequence does not refer to a capture group name in the corresponding
+    /// regex, then it is replaced with an empty string.
     ///
     /// To write a literal `$` use `$$`.
     pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) {
diff --git a/src/re_unicode.rs b/src/re_unicode.rs
@@ -947,17 +947,22 @@ impl<'t> Captures<'t> {
     /// Expands all instances of `$name` in `replacement` to the corresponding
     /// capture group `name`, and writes them to the `dst` buffer given.
     ///
-    /// `name` may be an integer corresponding to the index of the
-    /// capture group (counted by order of opening parenthesis where `0` is the
+    /// `name` may be an integer corresponding to the index of the capture
+    /// group (counted by order of opening parenthesis where `0` is the
     /// entire match) or it can be a name (consisting of letters, digits or
     /// underscores) corresponding to a named capture group.
     ///
     /// If `name` isn't a valid capture group (whether the name doesn't exist
     /// or isn't a valid index), then it is replaced with the empty string.
     ///
-    /// The longest possible name is used. e.g., `$1a` looks up the capture
-    /// group named `1a` and not the capture group at index `1`. To exert more
-    /// precise control over the name, use braces, e.g., `${1}a`.
+    /// The longest possible name consisting of the characters `[_0-9A-Za-z]`
+    /// is used. e.g., `$1a` looks up the capture group named `1a` and not the
+    /// capture group at index `1`. To exert more precise control over the
+    /// name, or to refer to a capture group name that uses characters outside
+    /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When
+    /// using braces, any sequence of characters is permitted. If the sequence
+    /// does not refer to a capture group name in the corresponding regex, then
+    /// it is replaced with an empty string.
     ///
     /// To write a literal `$` use `$$`.
     pub fn expand(&self, replacement: &str, dst: &mut String) {
diff --git a/tests/api.rs b/tests/api.rs
@@ -195,6 +195,18 @@ expand!(
 );
 expand!(expand10, r"(?-u)(?P<a>\w+)\s+(?P<b>\d+)", "abc 123", "$bz$az", "");
 
+expand!(expand_name1, r"%(?P<Z>[a-z]+)", "%abc", "$Z%", "abc%");
+expand!(expand_name2, r"\[(?P<Z>[a-z]+)", "[abc", "$Z[", "abc[");
+expand!(expand_name3, r"\{(?P<Z>[a-z]+)", "{abc", "$Z{", "abc{");
+expand!(expand_name4, r"\}(?P<Z>[a-z]+)", "}abc", "$Z}", "abc}");
+expand!(expand_name5, r"%([a-z]+)", "%abc", "$1a%", "%");
+expand!(expand_name6, r"%([a-z]+)", "%abc", "${1}a%", "abca%");
+expand!(expand_name7, r"\[(?P<Z[>[a-z]+)", "[abc", "${Z[}[", "abc[");
+expand!(expand_name8, r"\[(?P<Z[>[a-z]+)", "[abc", "${foo}[", "[");
+expand!(expand_name9, r"\[(?P<Z[>[a-z]+)", "[abc", "${1a}[", "[");
+expand!(expand_name10, r"\[(?P<Z[>[a-z]+)", "[abc", "${#}[", "[");
+expand!(expand_name11, r"\[(?P<Z[>[a-z]+)", "[abc", "${$$}[", "[");
+
 split!(
     split1,
     r"(?-u)\s+",