Skip to content

Commit bb233ec

Browse files
committed
Support nested character classes and intersection with &&
This implements parts of UTS#18 RL1.3, namely nested character classes (e.g. `[a[b-c]]`) and intersections in classes (e.g. `[\w&&\p{Greek}]`). They can be combined to do things like `[\w&&[^a]]` to get all word characters except `a`. Fixes #341
1 parent bedc221 commit bb233ec

File tree

4 files changed

+684
-85
lines changed

4 files changed

+684
-85
lines changed

Diff for: regex-syntax/src/lib.rs

+156-1
Original file line numberDiff line numberDiff line change
@@ -680,7 +680,7 @@ impl CharClass {
680680
self.canonicalize()
681681
}
682682

683-
/// Canonicalze any sequence of ranges.
683+
/// Canonicalize any sequence of ranges.
684684
///
685685
/// This is responsible for enforcing the canonical format invariants
686686
/// as described on the docs for the `CharClass` type.
@@ -703,6 +703,41 @@ impl CharClass {
703703
ordered
704704
}
705705

706+
/// Calculate the intersection of two canonical character classes.
707+
///
708+
/// The returned intersection is canonical.
709+
fn intersection(&self, other: &CharClass) -> CharClass {
710+
if self.ranges.is_empty() || other.ranges.is_empty() {
711+
return CharClass::empty();
712+
}
713+
714+
let mut intersection = CharClass::empty();
715+
716+
let mut iter_a = self.ranges.iter();
717+
let mut iter_b = other.ranges.iter();
718+
let mut a = iter_a.next().unwrap();
719+
let mut b = iter_b.next().unwrap();
720+
loop {
721+
if let Some(i) = a.intersection(&b) {
722+
intersection.ranges.push(i);
723+
}
724+
725+
// If the range with the smaller end didn't match this time,
726+
// it won't ever match, so move on to the next one.
727+
let (iter, item) = if a.end < b.end {
728+
(&mut iter_a, &mut a)
729+
} else {
730+
(&mut iter_b, &mut b)
731+
};
732+
match iter.next() {
733+
Some(v) => *item = v,
734+
None => break, // no more ranges to check, done
735+
}
736+
}
737+
738+
intersection.canonicalize()
739+
}
740+
706741
/// Negates the character class.
707742
///
708743
/// For all `c` where `c` is a Unicode scalar value, `c` matches `self`
@@ -801,6 +836,18 @@ impl ClassRange {
801836
max(self.start, other.start) <= inc_char(min(self.end, other.end))
802837
}
803838

839+
/// Returns the intersection of the two ranges if they have common
840+
/// characters, `None` otherwise.
841+
fn intersection(&self, other: &ClassRange) -> Option<ClassRange> {
842+
let start = max(self.start, other.start);
843+
let end = min(self.end, other.end);
844+
if start <= end {
845+
Some(ClassRange::new(start, end))
846+
} else {
847+
None
848+
}
849+
}
850+
804851
/// Creates a new range representing the union of `self` and `other`.
805852
fn merge(self, other: ClassRange) -> ClassRange {
806853
ClassRange {
@@ -1907,6 +1954,108 @@ mod tests {
19071954
]));
19081955
}
19091956

1957+
#[test]
fn class_intersection_empty() {
    // One operand is built from no ranges at all.
    let nothing = class(&[]);
    let only_a = class(&[('a', 'a')]);
    let expected = class(&[]);
    assert_intersection(nothing, only_a, expected);
}
1963+
1964+
#[test]
fn class_intersection_single_equal() {
    // Both operands hold the same single character.
    let left = class(&[('a', 'a')]);
    let right = class(&[('a', 'a')]);
    let expected = class(&[('a', 'a')]);
    assert_intersection(left, right, expected);
}
1970+
1971+
#[test]
fn class_intersection_single_unequal() {
    // Two different single characters share nothing.
    let only_a = class(&[('a', 'a')]);
    let only_b = class(&[('b', 'b')]);
    let expected = class(&[]);
    assert_intersection(only_a, only_b, expected);
}
1977+
1978+
#[test]
fn class_intersection_single_in_other() {
    // A single character that sits at the start of the other range.
    let single = class(&[('a', 'a')]);
    let wider = class(&[('a', 'c')]);
    let expected = class(&[('a', 'a')]);
    assert_intersection(single, wider, expected);
}
1984+
1985+
#[test]
fn class_intersection_range_in_other() {
    // The shorter range shares its start with the longer one.
    let shorter = class(&[('a', 'b')]);
    let longer = class(&[('a', 'c')]);
    let expected = class(&[('a', 'b')]);
    assert_intersection(shorter, longer, expected);
}
1991+
1992+
#[test]
fn class_intersection_range_intersection() {
    // The two ranges overlap in exactly one character, 'b'.
    let lower = class(&[('a', 'b')]);
    let upper = class(&[('b', 'c')]);
    let expected = class(&[('b', 'b')]);
    assert_intersection(lower, upper, expected);
}
1998+
1999+
#[test]
fn class_intersection_only_adjacent() {
    // The ranges touch ('b' is followed by 'c') but do not overlap.
    let lower = class(&[('a', 'b')]);
    let upper = class(&[('c', 'd')]);
    let expected = class(&[]);
    assert_intersection(lower, upper, expected);
}
2005+
2006+
#[test]
fn class_intersection_range_subset() {
    // The inner range lies strictly inside the outer one.
    let inner = class(&[('b', 'c')]);
    let outer = class(&[('a', 'd')]);
    let expected = class(&[('b', 'c')]);
    assert_intersection(inner, outer, expected);
}
2012+
2013+
#[test]
fn class_intersection_many_ranges_in_one_big() {
    // Several separate ranges, all covered by one range on the other side.
    let pieces = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
    let span = class(&[('a', 'h')]);
    let expected = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
    assert_intersection(pieces, span, expected);
}
2021+
2022+
#[test]
fn class_intersection_many_ranges_same() {
    // Both operands are built from the same three ranges.
    let left = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
    let right = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
    let expected = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
    assert_intersection(left, right, expected);
}
2030+
2031+
#[test]
fn class_intersection_multiple_non_intersecting() {
    // Ranges interleave with gaps between them and never overlap.
    let left = class(&[('a', 'b'), ('g', 'h')]);
    let right = class(&[('d', 'e'), ('k', 'l')]);
    let expected = class(&[]);
    assert_intersection(left, right, expected);
}
2037+
2038+
#[test]
fn class_intersection_non_intersecting_then_intersecting() {
    // Only the last of the three ranges contains 'h'.
    let pieces = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
    let only_h = class(&[('h', 'h')]);
    let expected = class(&[('h', 'h')]);
    assert_intersection(pieces, only_h, expected);
}
2044+
2045+
#[test]
fn class_intersection_adjacent_alternating() {
    // The two inputs alternate along the alphabet without sharing a letter.
    let left = class(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
    let right = class(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
    let expected = class(&[]);
    assert_intersection(left, right, expected);
}
2051+
2052+
#[test]
fn class_intersection_overlapping_alternating() {
    // Inputs are given as touching/overlapping pieces; the expected result
    // is the single range 'b'..='f'.
    let left = class(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
    let right = class(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
    let expected = class(&[('b', 'f')]);
    assert_intersection(left, right, expected);
}
2058+
19102059
#[test]
19112060
fn class_canon_overlap_many_case_fold() {
19122061
let cls = class(&[
@@ -2056,4 +2205,10 @@ mod tests {
20562205
let expr = e("(?-u)[-./]");
20572206
assert_eq!("(?-u:[-\\.-/])", expr.to_string());
20582207
}
2208+
2209+
fn assert_intersection(cls1: CharClass, cls2: CharClass, expected: CharClass) {
2210+
// intersection operation should be commutative
2211+
assert_eq!(cls1.intersection(&cls2), expected);
2212+
assert_eq!(cls2.intersection(&cls1), expected);
2213+
}
20592214
}

0 commit comments

Comments
 (0)