Skip to content

Commit 37faa6e

Browse files
committed
syntax/hir: add new special word boundaries to HIR
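This builds on the previous commit by adding the new special word boundary assertions (start, end, start-half and end-half, each in ASCII and Unicode variants) to the HIR, and updates the AST->HIR translation to produce these assertions from the corresponding AST elements. Ref #469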
1 parent 8f77e22 commit 37faa6e

File tree

3 files changed

+126
-19
lines changed

3 files changed

+126
-19
lines changed

regex-syntax/src/hir/mod.rs

+82-13
Original file line numberDiff line numberDiff line change
@@ -1635,6 +1635,42 @@ pub enum Look {
16351635
WordUnicode = 1 << 8,
16361636
/// Match a Unicode-aware negation of a word boundary.
16371637
WordUnicodeNegate = 1 << 9,
1638+
/// Match the start of an ASCII-only word boundary. That is, this matches a
1639+
/// position at either the beginning of the haystack or where the previous
1640+
/// character is not a word character and the following character is a word
1641+
/// character.
1642+
WordStartAscii = 1 << 10,
1643+
/// Match the end of an ASCII-only word boundary. That is, this matches
1644+
/// a position at either the end of the haystack or where the previous
1645+
/// character is a word character and the following character is not a word
1646+
/// character.
1647+
WordEndAscii = 1 << 11,
1648+
/// Match the start of a Unicode word boundary. That is, this matches a
1649+
/// position at either the beginning of the haystack or where the previous
1650+
/// character is not a word character and the following character is a word
1651+
/// character.
1652+
WordStartUnicode = 1 << 12,
1653+
/// Match the end of a Unicode word boundary. That is, this matches a
1654+
/// position at either the end of the haystack or where the previous
1655+
/// character is a word character and the following character is not a word
1656+
/// character.
1657+
WordEndUnicode = 1 << 13,
1658+
/// Match the start half of an ASCII-only word boundary. That is, this
1659+
/// matches a position at either the beginning of the haystack or where the
1660+
/// previous character is not a word character.
1661+
WordStartHalfAscii = 1 << 14,
1662+
/// Match the end half of an ASCII-only word boundary. That is, this
1663+
/// matches a position at either the end of the haystack or where the
1664+
/// following character is not a word character.
1665+
WordEndHalfAscii = 1 << 15,
1666+
/// Match the start half of a Unicode word boundary. That is, this matches
1667+
/// a position at either the beginning of the haystack or where the
1668+
/// previous character is not a word character.
1669+
WordStartHalfUnicode = 1 << 16,
1670+
/// Match the end half of a Unicode word boundary. That is, this matches
1671+
/// a position at either the end of the haystack or where the following
1672+
/// character is not a word character.
1673+
WordEndHalfUnicode = 1 << 17,
16381674
}
16391675

16401676
impl Look {
@@ -1656,6 +1692,14 @@ impl Look {
16561692
Look::WordAsciiNegate => Look::WordAsciiNegate,
16571693
Look::WordUnicode => Look::WordUnicode,
16581694
Look::WordUnicodeNegate => Look::WordUnicodeNegate,
1695+
Look::WordStartAscii => Look::WordEndAscii,
1696+
Look::WordEndAscii => Look::WordStartAscii,
1697+
Look::WordStartUnicode => Look::WordEndUnicode,
1698+
Look::WordEndUnicode => Look::WordStartUnicode,
1699+
Look::WordStartHalfAscii => Look::WordEndHalfAscii,
1700+
Look::WordEndHalfAscii => Look::WordStartHalfAscii,
1701+
Look::WordStartHalfUnicode => Look::WordEndHalfUnicode,
1702+
Look::WordEndHalfUnicode => Look::WordStartHalfUnicode,
16591703
}
16601704
}
16611705

@@ -1676,16 +1720,24 @@ impl Look {
16761720
#[inline]
16771721
pub const fn from_repr(repr: u32) -> Option<Look> {
16781722
match repr {
1679-
0b00_0000_0001 => Some(Look::Start),
1680-
0b00_0000_0010 => Some(Look::End),
1681-
0b00_0000_0100 => Some(Look::StartLF),
1682-
0b00_0000_1000 => Some(Look::EndLF),
1683-
0b00_0001_0000 => Some(Look::StartCRLF),
1684-
0b00_0010_0000 => Some(Look::EndCRLF),
1685-
0b00_0100_0000 => Some(Look::WordAscii),
1686-
0b00_1000_0000 => Some(Look::WordAsciiNegate),
1687-
0b01_0000_0000 => Some(Look::WordUnicode),
1688-
0b10_0000_0000 => Some(Look::WordUnicodeNegate),
1723+
0b00_0000_0000_0000_0001 => Some(Look::Start),
1724+
0b00_0000_0000_0000_0010 => Some(Look::End),
1725+
0b00_0000_0000_0000_0100 => Some(Look::StartLF),
1726+
0b00_0000_0000_0000_1000 => Some(Look::EndLF),
1727+
0b00_0000_0000_0001_0000 => Some(Look::StartCRLF),
1728+
0b00_0000_0000_0010_0000 => Some(Look::EndCRLF),
1729+
0b00_0000_0000_0100_0000 => Some(Look::WordAscii),
1730+
0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate),
1731+
0b00_0000_0001_0000_0000 => Some(Look::WordUnicode),
1732+
0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate),
1733+
0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii),
1734+
0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii),
1735+
0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode),
1736+
0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode),
1737+
0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii),
1738+
0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii),
1739+
0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode),
1740+
0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode),
16891741
_ => None,
16901742
}
16911743
}
@@ -1710,6 +1762,14 @@ impl Look {
17101762
Look::WordAsciiNegate => 'B',
17111763
Look::WordUnicode => '𝛃',
17121764
Look::WordUnicodeNegate => '𝚩',
1765+
Look::WordStartAscii => '<',
1766+
Look::WordEndAscii => '>',
1767+
Look::WordStartUnicode => '〈',
1768+
Look::WordEndUnicode => '〉',
1769+
Look::WordStartHalfAscii => '◁',
1770+
Look::WordEndHalfAscii => '▷',
1771+
Look::WordStartHalfUnicode => '◀',
1772+
Look::WordEndHalfUnicode => '▶',
17131773
}
17141774
}
17151775
}
@@ -2703,13 +2763,22 @@ impl LookSet {
27032763
pub fn contains_word_unicode(self) -> bool {
27042764
self.contains(Look::WordUnicode)
27052765
|| self.contains(Look::WordUnicodeNegate)
2766+
|| self.contains(Look::WordStartUnicode)
2767+
|| self.contains(Look::WordEndUnicode)
2768+
|| self.contains(Look::WordStartHalfUnicode)
2769+
|| self.contains(Look::WordEndHalfUnicode)
27062770
}
27072771

27082772
/// Returns true if and only if this set contains any ASCII word boundary
27092773
/// or negated ASCII word boundary assertions.
27102774
#[inline]
27112775
pub fn contains_word_ascii(self) -> bool {
2712-
self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate)
2776+
self.contains(Look::WordAscii)
2777+
|| self.contains(Look::WordAsciiNegate)
2778+
|| self.contains(Look::WordStartAscii)
2779+
|| self.contains(Look::WordEndAscii)
2780+
|| self.contains(Look::WordStartHalfAscii)
2781+
|| self.contains(Look::WordEndHalfAscii)
27132782
}
27142783

27152784
/// Returns an iterator over all of the look-around assertions in this set.
@@ -3769,7 +3838,7 @@ mod tests {
37693838
assert_eq!(0, set.iter().count());
37703839

37713840
let set = LookSet::full();
3772-
assert_eq!(10, set.iter().count());
3841+
assert_eq!(18, set.iter().count());
37733842

37743843
let set =
37753844
LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
@@ -3787,6 +3856,6 @@ mod tests {
37873856
let res = format!("{:?}", LookSet::empty());
37883857
assert_eq!("∅", res);
37893858
let res = format!("{:?}", LookSet::full());
3790-
assert_eq!("Az^$rRbB𝛃𝚩", res);
3859+
assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res);
37913860
}
37923861
}

regex-syntax/src/hir/print.rs

+24
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,30 @@ impl<W: fmt::Write> Visitor for Writer<W> {
202202
hir::Look::WordUnicodeNegate => {
203203
self.wtr.write_str(r"\B")?;
204204
}
205+
hir::Look::WordStartAscii => {
206+
self.wtr.write_str(r"(?-u:\b{start})")?;
207+
}
208+
hir::Look::WordEndAscii => {
209+
self.wtr.write_str(r"(?-u:\b{end})")?;
210+
}
211+
hir::Look::WordStartUnicode => {
212+
self.wtr.write_str(r"\b{start}")?;
213+
}
214+
hir::Look::WordEndUnicode => {
215+
self.wtr.write_str(r"\b{end}")?;
216+
}
217+
hir::Look::WordStartHalfAscii => {
218+
self.wtr.write_str(r"(?-u:\b{start-half})")?;
219+
}
220+
hir::Look::WordEndHalfAscii => {
221+
self.wtr.write_str(r"(?-u:\b{end-half})")?;
222+
}
223+
hir::Look::WordStartHalfUnicode => {
224+
self.wtr.write_str(r"\b{start-half}")?;
225+
}
226+
hir::Look::WordEndHalfUnicode => {
227+
self.wtr.write_str(r"\b{end-half}")?;
228+
}
205229
},
206230
HirKind::Capture(hir::Capture { ref name, .. }) => {
207231
self.wtr.write_str("(")?;

regex-syntax/src/hir/translate.rs

+20-6
Original file line numberDiff line numberDiff line change
@@ -964,18 +964,32 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
964964
}),
965965
ast::AssertionKind::WordBoundaryStart
966966
| ast::AssertionKind::WordBoundaryStartAngle => {
967-
Hir::look(if unicode { todo!() } else { todo!() })
967+
Hir::look(if unicode {
968+
hir::Look::WordStartUnicode
969+
} else {
970+
hir::Look::WordStartAscii
971+
})
968972
}
969973
ast::AssertionKind::WordBoundaryEnd
970974
| ast::AssertionKind::WordBoundaryEndAngle => {
971-
Hir::look(if unicode { todo!() } else { todo!() })
975+
Hir::look(if unicode {
976+
hir::Look::WordEndUnicode
977+
} else {
978+
hir::Look::WordEndAscii
979+
})
972980
}
973981
ast::AssertionKind::WordBoundaryStartHalf => {
974-
Hir::look(if unicode { todo!() } else { todo!() })
975-
}
976-
ast::AssertionKind::WordBoundaryEndHalf => {
977-
Hir::look(if unicode { todo!() } else { todo!() })
982+
Hir::look(if unicode {
983+
hir::Look::WordStartHalfUnicode
984+
} else {
985+
hir::Look::WordStartHalfAscii
986+
})
978987
}
988+
ast::AssertionKind::WordBoundaryEndHalf => Hir::look(if unicode {
989+
hir::Look::WordEndHalfUnicode
990+
} else {
991+
hir::Look::WordEndHalfAscii
992+
}),
979993
})
980994
}
981995

0 commit comments

Comments
 (0)