Skip to content

Commit 9d7000a

Browse files
committed
syntax: allow Unicode in capture names
This changes the rules for capture names to be much less restrictive. Namely, the requirements are now: 1. Must begin with an `_` or any alphabetic codepoint. 2. After the first codepoint, the name may contain any sequence of alpha-numeric codepoints along with `_`, `.`, `[` and `]`. Closes #595
1 parent 5d97dfa commit 9d7000a

File tree

3 files changed

+125
-8
lines changed

3 files changed

+125
-8
lines changed

regex-syntax/src/ast/parse.rs

+108-5
Original file line numberDiff line numberDiff line change
@@ -109,11 +109,11 @@ fn is_hex(c: char) -> bool {
109109
/// If `first` is true, then `c` is treated as the first character in the
110110
/// group name (which must be alphabetic or underscore).
111111
fn is_capture_char(c: char, first: bool) -> bool {
112-
c == '_'
113-
|| (!first
114-
&& (('0' <= c && c <= '9') || c == '.' || c == '[' || c == ']'))
115-
|| ('A' <= c && c <= 'Z')
116-
|| ('a' <= c && c <= 'z')
112+
if first {
113+
c == '_' || c.is_alphabetic()
114+
} else {
115+
c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
116+
}
117117
}
118118

119119
/// A builder for a regular expression parser.
@@ -3910,6 +3910,55 @@ bar
39103910
}))
39113911
);
39123912

3913+
assert_eq!(
3914+
parser("(?P<a¾>)").parse(),
3915+
Ok(Ast::Group(ast::Group {
3916+
span: Span::new(
3917+
Position::new(0, 1, 1),
3918+
Position::new(9, 1, 9),
3919+
),
3920+
kind: ast::GroupKind::CaptureName {
3921+
starts_with_p: true,
3922+
name: ast::CaptureName {
3923+
span: Span::new(
3924+
Position::new(4, 1, 5),
3925+
Position::new(7, 1, 7),
3926+
),
3927+
name: s("a¾"),
3928+
index: 1,
3929+
}
3930+
},
3931+
ast: Box::new(Ast::Empty(Span::new(
3932+
Position::new(8, 1, 8),
3933+
Position::new(8, 1, 8),
3934+
))),
3935+
}))
3936+
);
3937+
assert_eq!(
3938+
parser("(?P<名字>)").parse(),
3939+
Ok(Ast::Group(ast::Group {
3940+
span: Span::new(
3941+
Position::new(0, 1, 1),
3942+
Position::new(12, 1, 9),
3943+
),
3944+
kind: ast::GroupKind::CaptureName {
3945+
starts_with_p: true,
3946+
name: ast::CaptureName {
3947+
span: Span::new(
3948+
Position::new(4, 1, 5),
3949+
Position::new(10, 1, 7),
3950+
),
3951+
name: s("名字"),
3952+
index: 1,
3953+
}
3954+
},
3955+
ast: Box::new(Ast::Empty(Span::new(
3956+
Position::new(11, 1, 8),
3957+
Position::new(11, 1, 8),
3958+
))),
3959+
}))
3960+
);
3961+
39133962
assert_eq!(
39143963
parser("(?P<").parse().unwrap_err(),
39153964
TestError {
@@ -3968,6 +4017,60 @@ bar
39684017
},
39694018
}
39704019
);
4020+
assert_eq!(
4021+
parser("(?P<5>)").parse().unwrap_err(),
4022+
TestError {
4023+
span: span(4..5),
4024+
kind: ast::ErrorKind::GroupNameInvalid,
4025+
}
4026+
);
4027+
assert_eq!(
4028+
parser("(?P<5a>)").parse().unwrap_err(),
4029+
TestError {
4030+
span: span(4..5),
4031+
kind: ast::ErrorKind::GroupNameInvalid,
4032+
}
4033+
);
4034+
assert_eq!(
4035+
parser("(?P<¾>)").parse().unwrap_err(),
4036+
TestError {
4037+
span: Span::new(
4038+
Position::new(4, 1, 5),
4039+
Position::new(6, 1, 6),
4040+
),
4041+
kind: ast::ErrorKind::GroupNameInvalid,
4042+
}
4043+
);
4044+
assert_eq!(
4045+
parser("(?P<¾a>)").parse().unwrap_err(),
4046+
TestError {
4047+
span: Span::new(
4048+
Position::new(4, 1, 5),
4049+
Position::new(6, 1, 6),
4050+
),
4051+
kind: ast::ErrorKind::GroupNameInvalid,
4052+
}
4053+
);
4054+
assert_eq!(
4055+
parser("(?P<☃>)").parse().unwrap_err(),
4056+
TestError {
4057+
span: Span::new(
4058+
Position::new(4, 1, 5),
4059+
Position::new(7, 1, 6),
4060+
),
4061+
kind: ast::ErrorKind::GroupNameInvalid,
4062+
}
4063+
);
4064+
assert_eq!(
4065+
parser("(?P<a☃>)").parse().unwrap_err(),
4066+
TestError {
4067+
span: Span::new(
4068+
Position::new(5, 1, 6),
4069+
Position::new(8, 1, 7),
4070+
),
4071+
kind: ast::ErrorKind::GroupNameInvalid,
4072+
}
4073+
);
39714074
}
39724075

39734076
#[test]

src/expand.rs

+9-1
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,8 @@ fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> {
182182
})
183183
}
184184

185-
/// Returns true if and only if the given byte is allowed in a capture name.
185+
/// Returns true if and only if the given byte is allowed in a capture name
186+
/// written in non-brace form.
186187
fn is_valid_cap_letter(b: u8) -> bool {
187188
match b {
188189
b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true,
@@ -236,4 +237,11 @@ mod tests {
236237
find!(find_cap_ref17, "$x_$y", c!("x_", 3));
237238
find!(find_cap_ref18, "${#}", c!("#", 4));
238239
find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
240+
find!(find_cap_ref20, "${¾}", c!("¾", 5));
241+
find!(find_cap_ref21, "${¾a}", c!("¾a", 6));
242+
find!(find_cap_ref22, "${a¾}", c!("a¾", 6));
243+
find!(find_cap_ref23, "${☃}", c!("☃", 6));
244+
find!(find_cap_ref24, "${a☃}", c!("a☃", 7));
245+
find!(find_cap_ref25, "${☃a}", c!("☃a", 7));
246+
find!(find_cap_ref26, "${名字}", c!("名字", 9));
239247
}

src/lib.rs

+8-2
Original file line numberDiff line numberDiff line change
@@ -360,13 +360,19 @@ regex matches `abc` at positions `0`, `1`, `2` and `3`.
360360
361361
<pre class="rust">
362362
(exp) numbered capture group (indexed by opening parenthesis)
363-
(?P&lt;name&gt;exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
364-
(?&lt;name&gt;exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
363+
(?P&lt;name&gt;exp) named (also numbered) capture group (names must be alpha-numeric; see below)
364+
(?&lt;name&gt;exp) named (also numbered) capture group (names must be alpha-numeric; see below)
365365
(?:exp) non-capturing group
366366
(?flags) set flags within current group
367367
(?flags:exp) set flags for exp (non-capturing)
368368
</pre>
369369
370+
Capture group names consist of a sequence of alpha-numeric Unicode codepoints,
371+
in addition to `.`, `_`, `[` and `]`. Names must start with either an `_` or
372+
an alphabetic codepoint. Alphabetic codepoints correspond to the `Alphabetic`
373+
Unicode property, while numeric codepoints correspond to the union of the
374+
`Decimal_Number`, `Letter_Number` and `Other_Number` general categories.
375+
370376
Flags are each a single character. For example, `(?x)` sets the flag `x`
371377
and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at
372378
the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets

0 commit comments

Comments
 (0)