Skip to content

Commit 68d5acb

Browse files
committed
syntax/ast: add support for additional word boundary assertions
This adds AST support for the following new assertions: \b{start}, \b{end}, \b{start-half}, \b{end-half}, \< and \>. The last two, \< and \>, are aliases for \b{start} and \b{end}. The parsing for this is a little suspect since there's some ambiguity between, e.g., \b{5} and \b{start}, but we handle it by allowing the parser to look for one of the new special assertions, and then back up if it fails to find one so that it can try to parse a counted repetition. Ref #469
1 parent 9d5390f commit 68d5acb

File tree

5 files changed

+281
-15
lines changed

5 files changed

+281
-15
lines changed

regex-syntax/src/ast/mod.rs

+47
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,18 @@ pub enum ErrorKind {
162162
/// `(?i)*`. It is, however, possible to create a repetition operating on
163163
/// an empty sub-expression. For example, `()*` is still considered valid.
164164
RepetitionMissing,
165+
/// The special word boundary syntax, `\b{something}`, was used, but
166+
/// either EOF without `}` was seen, or an invalid character in the
167+
/// braces was seen.
168+
SpecialWordBoundaryUnclosed,
169+
/// The special word boundary syntax, `\b{something}`, was used, but
170+
/// `something` was not recognized as a valid word boundary kind.
171+
SpecialWordBoundaryUnrecognized,
172+
/// The syntax `\b{` was observed, but afterwards the end of the pattern
173+
/// was observed without being able to tell whether it was meant to be a
174+
/// bounded repetition on the `\b` or the beginning of a special word
175+
/// boundary assertion.
176+
SpecialWordOrRepetitionUnexpectedEof,
165177
/// The Unicode class is not valid. This typically occurs when a `\p` is
166178
/// followed by something other than a `{`.
167179
UnicodeClassInvalid,
@@ -260,6 +272,29 @@ impl core::fmt::Display for ErrorKind {
260272
RepetitionMissing => {
261273
write!(f, "repetition operator missing expression")
262274
}
275+
SpecialWordBoundaryUnclosed => {
276+
write!(
277+
f,
278+
"special word boundary assertion is either \
279+
unclosed or contains an invalid character",
280+
)
281+
}
282+
SpecialWordBoundaryUnrecognized => {
283+
write!(
284+
f,
285+
"unrecognized special word boundary assertion, \
286+
valid choices are: start, end, start-half \
287+
or end-half",
288+
)
289+
}
290+
SpecialWordOrRepetitionUnexpectedEof => {
291+
write!(
292+
f,
293+
"found either the beginning of a special word \
294+
boundary or a bounded repetition on a \\b with \
295+
an opening brace, but no closing brace",
296+
)
297+
}
263298
UnicodeClassInvalid => {
264299
write!(f, "invalid Unicode character class")
265300
}
@@ -1293,6 +1328,18 @@ pub enum AssertionKind {
12931328
WordBoundary,
12941329
/// `\B`
12951330
NotWordBoundary,
1331+
/// `\b{start}`
1332+
WordBoundaryStart,
1333+
/// `\b{end}`
1334+
WordBoundaryEnd,
1335+
/// `\<` (alias for `\b{start}`)
1336+
WordBoundaryStartAngle,
1337+
/// `\>` (alias for `\b{end}`)
1338+
WordBoundaryEndAngle,
1339+
/// `\b{start-half}`
1340+
WordBoundaryStartHalf,
1341+
/// `\b{end-half}`
1342+
WordBoundaryEndHalf,
12961343
}
12971344

12981345
/// A repetition operation applied to a regular expression.

regex-syntax/src/ast/parse.rs

+211-15
Original file line numberDiff line numberDiff line change
@@ -1528,18 +1528,115 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
15281528
span,
15291529
kind: ast::AssertionKind::EndText,
15301530
})),
1531-
'b' => Ok(Primitive::Assertion(ast::Assertion {
1532-
span,
1533-
kind: ast::AssertionKind::WordBoundary,
1534-
})),
1531+
'b' => {
1532+
let mut wb = ast::Assertion {
1533+
span,
1534+
kind: ast::AssertionKind::WordBoundary,
1535+
};
1536+
// After a \b, we "try" to parse things like \b{start} for
1537+
// special word boundary assertions.
1538+
if !self.is_eof() && self.char() == '{' {
1539+
if let Some(kind) =
1540+
self.maybe_parse_special_word_boundary(start)?
1541+
{
1542+
wb.kind = kind;
1543+
wb.span.end = self.pos();
1544+
}
1545+
}
1546+
Ok(Primitive::Assertion(wb))
1547+
}
15351548
'B' => Ok(Primitive::Assertion(ast::Assertion {
15361549
span,
15371550
kind: ast::AssertionKind::NotWordBoundary,
15381551
})),
1552+
'<' => Ok(Primitive::Assertion(ast::Assertion {
1553+
span,
1554+
kind: ast::AssertionKind::WordBoundaryStartAngle,
1555+
})),
1556+
'>' => Ok(Primitive::Assertion(ast::Assertion {
1557+
span,
1558+
kind: ast::AssertionKind::WordBoundaryEndAngle,
1559+
})),
15391560
_ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
15401561
}
15411562
}
15421563

1564+
/// Attempt to parse a specialty word boundary. That is, `\b{start}`,
1565+
/// `\b{end}`, `\b{start-half}` or `\b{end-half}`.
1566+
///
1567+
/// This is similar to `maybe_parse_ascii_class` in that, in most cases,
1568+
/// if it fails it will just return `None` with no error. This is done
1569+
/// because `\b{5}` is a valid expression and we want to let that be parsed
1570+
/// by the existing counted repetition parsing code. (I thought about just
1571+
/// invoking the counted repetition code from here, but it seemed a little
1572+
/// ham-fisted.)
1573+
///
1574+
/// Unlike `maybe_parse_ascii_class` though, this can return an error.
1575+
/// Namely, if we definitely know it isn't a counted repetition, then we
1576+
/// return an error specific to the specialty word boundaries.
1577+
///
1578+
/// This assumes the parser is positioned at a `{` immediately following
1579+
/// a `\b`. When `None` is returned, the parser is returned to the position
1580+
/// at which it started: pointing at a `{`.
1581+
///
1582+
/// The position given should correspond to the start of the `\b`.
1583+
fn maybe_parse_special_word_boundary(
1584+
&self,
1585+
wb_start: Position,
1586+
) -> Result<Option<ast::AssertionKind>> {
1587+
assert_eq!(self.char(), '{');
1588+
1589+
let is_valid_char = |c| match c {
1590+
'A'..='Z' | 'a'..='z' | '-' => true,
1591+
_ => false,
1592+
};
1593+
let start = self.pos();
1594+
if !self.bump_and_bump_space() {
1595+
return Err(self.error(
1596+
Span::new(wb_start, self.pos()),
1597+
ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
1598+
));
1599+
}
1600+
let start_contents = self.pos();
1601+
// This is one of the critical bits: if the first non-whitespace
1602+
// character isn't in [-A-Za-z] (i.e., this can't be a special word
1603+
// boundary), then we bail and let the counted repetition parser deal
1604+
// with this.
1605+
if !is_valid_char(self.char()) {
1606+
self.parser().pos.set(start);
1607+
return Ok(None);
1608+
}
1609+
1610+
// Now collect up our chars until we see a '}'.
1611+
let mut scratch = self.parser().scratch.borrow_mut();
1612+
scratch.clear();
1613+
while !self.is_eof() && is_valid_char(self.char()) {
1614+
scratch.push(self.char());
1615+
self.bump_and_bump_space();
1616+
}
1617+
if self.is_eof() || self.char() != '}' {
1618+
return Err(self.error(
1619+
Span::new(start, self.pos()),
1620+
ast::ErrorKind::SpecialWordBoundaryUnclosed,
1621+
));
1622+
}
1623+
let end = self.pos();
1624+
self.bump();
1625+
let kind = match scratch.as_str() {
1626+
"start" => ast::AssertionKind::WordBoundaryStart,
1627+
"end" => ast::AssertionKind::WordBoundaryEnd,
1628+
"start-half" => ast::AssertionKind::WordBoundaryStartHalf,
1629+
"end-half" => ast::AssertionKind::WordBoundaryEndHalf,
1630+
_ => {
1631+
return Err(self.error(
1632+
Span::new(start_contents, end),
1633+
ast::ErrorKind::SpecialWordBoundaryUnrecognized,
1634+
))
1635+
}
1636+
};
1637+
Ok(Some(kind))
1638+
}
1639+
15431640
/// Parse an octal representation of a Unicode codepoint up to 3 digits
15441641
/// long. This expects the parser to be positioned at the first octal
15451642
/// digit and advances the parser to the first character immediately
@@ -1967,9 +2064,9 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
19672064
// because parsing cannot fail with any interesting error. For example,
19682065
// in order to use an ASCII character class, it must be enclosed in
19692066
// double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think
1970-
// of it as "ASCII character characters have the syntax `[:NAME:]`
1971-
// which can only appear within character brackets." This means that
1972-
// things like `[[:lower:]A]` are legal constructs.
2067+
// of it as "ASCII character classes have the syntax `[:NAME:]` which
2068+
// can only appear within character brackets." This means that things
2069+
// like `[[:lower:]A]` are legal constructs.
19732070
//
19742071
// However, if one types an incorrect ASCII character class, e.g.,
19752072
// `[[:loower:]]`, then we treat that as a normal nested character
@@ -3295,6 +3392,23 @@ bar
32953392
ast: Box::new(lit('a', 0)),
32963393
}))
32973394
);
3395+
assert_eq!(
3396+
parser(r"\b{5,9}").parse(),
3397+
Ok(Ast::repetition(ast::Repetition {
3398+
span: span(0..7),
3399+
op: ast::RepetitionOp {
3400+
span: span(2..7),
3401+
kind: ast::RepetitionKind::Range(
3402+
ast::RepetitionRange::Bounded(5, 9)
3403+
),
3404+
},
3405+
greedy: true,
3406+
ast: Box::new(Ast::assertion(ast::Assertion {
3407+
span: span(0..2),
3408+
kind: ast::AssertionKind::WordBoundary,
3409+
})),
3410+
}))
3411+
);
32983412

32993413
assert_eq!(
33003414
parser(r"(?i){0}").parse().unwrap_err(),
@@ -4381,6 +4495,48 @@ bar
43814495
kind: ast::AssertionKind::WordBoundary,
43824496
}))
43834497
);
4498+
assert_eq!(
4499+
parser(r"\b{start}").parse_primitive(),
4500+
Ok(Primitive::Assertion(ast::Assertion {
4501+
span: span(0..9),
4502+
kind: ast::AssertionKind::WordBoundaryStart,
4503+
}))
4504+
);
4505+
assert_eq!(
4506+
parser(r"\b{end}").parse_primitive(),
4507+
Ok(Primitive::Assertion(ast::Assertion {
4508+
span: span(0..7),
4509+
kind: ast::AssertionKind::WordBoundaryEnd,
4510+
}))
4511+
);
4512+
assert_eq!(
4513+
parser(r"\b{start-half}").parse_primitive(),
4514+
Ok(Primitive::Assertion(ast::Assertion {
4515+
span: span(0..14),
4516+
kind: ast::AssertionKind::WordBoundaryStartHalf,
4517+
}))
4518+
);
4519+
assert_eq!(
4520+
parser(r"\b{end-half}").parse_primitive(),
4521+
Ok(Primitive::Assertion(ast::Assertion {
4522+
span: span(0..12),
4523+
kind: ast::AssertionKind::WordBoundaryEndHalf,
4524+
}))
4525+
);
4526+
assert_eq!(
4527+
parser(r"\<").parse_primitive(),
4528+
Ok(Primitive::Assertion(ast::Assertion {
4529+
span: span(0..2),
4530+
kind: ast::AssertionKind::WordBoundaryStartAngle,
4531+
}))
4532+
);
4533+
assert_eq!(
4534+
parser(r"\>").parse_primitive(),
4535+
Ok(Primitive::Assertion(ast::Assertion {
4536+
span: span(0..2),
4537+
kind: ast::AssertionKind::WordBoundaryEndAngle,
4538+
}))
4539+
);
43844540
assert_eq!(
43854541
parser(r"\B").parse_primitive(),
43864542
Ok(Primitive::Assertion(ast::Assertion {
@@ -4418,20 +4574,60 @@ bar
44184574
kind: ast::ErrorKind::EscapeUnrecognized,
44194575
}
44204576
);
4421-
// But also, < and > are banned, so that we may evolve them into
4422-
// start/end word boundary assertions. (Not sure if we will...)
4577+
4578+
// Starting a special word boundary without any non-whitespace chars
4579+
// after the brace makes it ambiguous whether the user meant to write
4580+
// a counted repetition (probably not?) or an actual special word
4581+
// boundary assertion.
44234582
assert_eq!(
4424-
parser(r"\<").parse_escape().unwrap_err(),
4583+
parser(r"\b{").parse_escape().unwrap_err(),
44254584
TestError {
4426-
span: span(0..2),
4427-
kind: ast::ErrorKind::EscapeUnrecognized,
4585+
span: span(0..3),
4586+
kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
44284587
}
44294588
);
44304589
assert_eq!(
4431-
parser(r"\>").parse_escape().unwrap_err(),
4590+
parser_ignore_whitespace(r"\b{ ").parse_escape().unwrap_err(),
44324591
TestError {
4433-
span: span(0..2),
4434-
kind: ast::ErrorKind::EscapeUnrecognized,
4592+
span: span(0..4),
4593+
kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
4594+
}
4595+
);
4596+
// When 'x' is not enabled, the space is seen as a non-[-A-Za-z] char,
4597+
// and thus causes the parser to treat it as a counted repetition.
4598+
assert_eq!(
4599+
parser(r"\b{ ").parse().unwrap_err(),
4600+
TestError {
4601+
span: span(4..4),
4602+
kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
4603+
}
4604+
);
4605+
// In this case, we got some valid chars that makes it look like the
4606+
// user is writing one of the special word boundary assertions, but
4607+
// we forget to close the brace.
4608+
assert_eq!(
4609+
parser(r"\b{foo").parse_escape().unwrap_err(),
4610+
TestError {
4611+
span: span(2..6),
4612+
kind: ast::ErrorKind::SpecialWordBoundaryUnclosed,
4613+
}
4614+
);
4615+
// We get the same error as above, except it is provoked by seeing a
4616+
// char that we know is invalid before seeing a closing brace.
4617+
assert_eq!(
4618+
parser(r"\b{foo!}").parse_escape().unwrap_err(),
4619+
TestError {
4620+
span: span(2..6),
4621+
kind: ast::ErrorKind::SpecialWordBoundaryUnclosed,
4622+
}
4623+
);
4624+
// And this one occurs when, syntactically, everything looks okay, but
4625+
// we don't use a valid spelling of a word boundary assertion.
4626+
assert_eq!(
4627+
parser(r"\b{foo}").parse_escape().unwrap_err(),
4628+
TestError {
4629+
span: span(3..6),
4630+
kind: ast::ErrorKind::SpecialWordBoundaryUnrecognized,
44354631
}
44364632
);
44374633

regex-syntax/src/ast/print.rs

+6
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,12 @@ impl<W: fmt::Write> Writer<W> {
261261
EndText => self.wtr.write_str(r"\z"),
262262
WordBoundary => self.wtr.write_str(r"\b"),
263263
NotWordBoundary => self.wtr.write_str(r"\B"),
264+
WordBoundaryStart => self.wtr.write_str(r"\b{start}"),
265+
WordBoundaryEnd => self.wtr.write_str(r"\b{end}"),
266+
WordBoundaryStartAngle => self.wtr.write_str(r"\<"),
267+
WordBoundaryEndAngle => self.wtr.write_str(r"\>"),
268+
WordBoundaryStartHalf => self.wtr.write_str(r"\b{start-half}"),
269+
WordBoundaryEndHalf => self.wtr.write_str(r"\b{end-half}"),
264270
}
265271
}
266272

regex-syntax/src/hir/translate.rs

+14
Original file line numberDiff line numberDiff line change
@@ -962,6 +962,20 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
962962
} else {
963963
hir::Look::WordAsciiNegate
964964
}),
965+
ast::AssertionKind::WordBoundaryStart
966+
| ast::AssertionKind::WordBoundaryStartAngle => {
967+
Hir::look(if unicode { todo!() } else { todo!() })
968+
}
969+
ast::AssertionKind::WordBoundaryEnd
970+
| ast::AssertionKind::WordBoundaryEndAngle => {
971+
Hir::look(if unicode { todo!() } else { todo!() })
972+
}
973+
ast::AssertionKind::WordBoundaryStartHalf => {
974+
Hir::look(if unicode { todo!() } else { todo!() })
975+
}
976+
ast::AssertionKind::WordBoundaryEndHalf => {
977+
Hir::look(if unicode { todo!() } else { todo!() })
978+
}
965979
})
966980
}
967981

regex-syntax/src/lib.rs

+3
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,9 @@ pub fn is_escapeable_character(c: char) -> bool {
334334
// escapeable, \< and \> will result in a parse error. Thus, we can
335335
// turn them into something else in the future without it being a
336336
// backwards incompatible change.
337+
//
338+
// OK, now we support \< and \>, and we need to retain them as *not*
339+
// escapeable here since the escape sequence is significant.
337340
'<' | '>' => false,
338341
_ => true,
339342
}

0 commit comments

Comments
 (0)