Skip to content

Commit 94d0ad4

Browse files
committed
Add regex sets.
Regex sets permit matching multiple (possibly overlapping) regular expressions in a single scan of the search text. This adds a few new types, with `RegexSet` being the primary one. All matching engines support regex sets, including the lazy DFA. This commit also refactors a lot of the code around handling captures into a central `Search`, which now also includes a set of matches that is used by regex sets to determine which regex has matched. We also merged the `Program` and `Insts` types, which were split up when adding the lazy DFA, but the code seemed more complicated because of it. Closes #156.
1 parent 640bfa7 commit 94d0ad4

21 files changed

+1207
-731
lines changed

HACKING.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ the NFA algorithm, because it was one fewer epsilon transition that it had to
112112
follow.
113113

114114
There exist more instructions and they are defined and documented in
115-
src/inst.rs.
115+
src/prog.rs.
116116

117117
Compilation has several knobs and a few unfortunately complicated invariants.
118118
Namely, the output of compilation can be one of two types of programs: a
@@ -163,7 +163,7 @@ engine (or engines) to use.
163163

164164
The logic for choosing which engine to execute is in src/exec.rs and is
165165
documented on the Exec type. Exec values collect regular expression
166-
Programs (defined in src/program.rs), which contain all the necessary tidbits
166+
Programs (defined in src/prog.rs), which contain all the necessary tidbits
167167
for actually executing a regular expression on search text.
168168

169169
For the most part, the execution logic is straight-forward and follows the

README.md

+28
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,34 @@ fn some_helper_function(text: &str) -> bool {
128128
Specifically, in this example, the regex will be compiled when it is used for
129129
the first time. On subsequent uses, it will reuse the previous compilation.
130130

131+
### Usage: match multiple regular expressions simultaneously
132+
133+
This demonstrates how to use a `RegexSet` to match multiple (possibly
134+
overlapping) regular expressions in a single scan of the search text:
135+
136+
```rust
137+
use regex::RegexSet;
138+
139+
let set = RegexSet::new(&[
140+
r"\w+",
141+
r"\d+",
142+
r"\pL+",
143+
r"foo",
144+
r"bar",
145+
r"barfoo",
146+
r"foobar",
147+
]).unwrap();
148+
149+
// Iterate over and collect all of the matches.
150+
let matches: Vec<_> = set.matches("foobar").into_iter().collect();
151+
assert_eq!(matches, vec![0, 2, 3, 4, 6]);
152+
153+
// You can also test whether a particular regex matched:
154+
let matches = set.matches("foobar");
155+
assert!(!matches.matched(5));
156+
assert!(matches.matched(6));
157+
```
158+
131159
### Usage: `regex!` compiler plugin
132160

133161
The `regex!` compiler plugin will compile your regexes at compile time. **This

benches/bench_dynamic_compile.rs

+14-13
Original file line numberDiff line numberDiff line change
@@ -8,54 +8,55 @@
88
// option. This file may not be copied, modified, or distributed
99
// except according to those terms.
1010

11+
use regex_syntax::Expr;
1112
use test::Bencher;
1213

13-
use regex::internal::ProgramBuilder;
14+
use regex::internal::Compiler;
1415

1516
#[bench]
1617
fn compile_simple(b: &mut Bencher) {
1718
b.iter(|| {
18-
let re = r"^bc(d|e)*$";
19-
ProgramBuilder::new(&re).compile().unwrap()
19+
let re = Expr::parse(r"^bc(d|e)*$").unwrap();
20+
Compiler::new().compile(&[re]).unwrap()
2021
});
2122
}
2223

2324
#[bench]
2425
fn compile_simple_bytes(b: &mut Bencher) {
2526
b.iter(|| {
26-
let re = r"^bc(d|e)*$";
27-
ProgramBuilder::new(&re).bytes(true).compile().unwrap()
27+
let re = Expr::parse(r"^bc(d|e)*$").unwrap();
28+
Compiler::new().bytes(true).compile(&[re]).unwrap()
2829
});
2930
}
3031

3132
#[bench]
3233
fn compile_small(b: &mut Bencher) {
3334
b.iter(|| {
34-
let re = r"\p{L}|\p{N}|\s|.|\d";
35-
ProgramBuilder::new(&re).compile().unwrap()
35+
let re = Expr::parse(r"\p{L}|\p{N}|\s|.|\d").unwrap();
36+
Compiler::new().compile(&[re]).unwrap()
3637
});
3738
}
3839

3940
#[bench]
4041
fn compile_small_bytes(b: &mut Bencher) {
4142
b.iter(|| {
42-
let re = r"\p{L}|\p{N}|\s|.|\d";
43-
ProgramBuilder::new(&re).bytes(true).compile().unwrap()
43+
let re = Expr::parse(r"\p{L}|\p{N}|\s|.|\d").unwrap();
44+
Compiler::new().bytes(true).compile(&[re]).unwrap()
4445
});
4546
}
4647

4748
#[bench]
4849
fn compile_huge(b: &mut Bencher) {
4950
b.iter(|| {
50-
let re = r"\p{L}{100}";
51-
ProgramBuilder::new(&re).compile().unwrap()
51+
let re = Expr::parse(r"\p{L}{100}").unwrap();
52+
Compiler::new().compile(&[re]).unwrap()
5253
});
5354
}
5455

5556
#[bench]
5657
fn compile_huge_bytes(b: &mut Bencher) {
5758
b.iter(|| {
58-
let re = r"\p{L}{100}";
59-
ProgramBuilder::new(&re).bytes(true).compile().unwrap()
59+
let re = Expr::parse(r"\p{L}{100}").unwrap();
60+
Compiler::new().bytes(true).compile(&[re]).unwrap()
6061
});
6162
}

examples/set.rs

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
extern crate regex;
2+
3+
use regex::RegexSet;
4+
5+
fn main() {
6+
let res = &[
7+
"abc",
8+
"xyzz",
9+
"^[ga-fh-z]+$",
10+
];
11+
let text = "abcggggggggxyz";
12+
let set = RegexSet::new(res).unwrap();
13+
println!("{:?}", set);
14+
let m = set.is_match("abcggggggggxyz");
15+
println!("match? {:?}", m);
16+
for mi in set.matches(text) {
17+
println!("{:?}", mi);
18+
}
19+
}

regex-syntax/src/lib.rs

+19-8
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,19 @@ pub enum Repeater {
177177
},
178178
}
179179

180+
impl Repeater {
181+
/// Returns true if and only if this repetition can match the empty string.
182+
fn matches_empty(&self) -> bool {
183+
use self::Repeater::*;
184+
match *self {
185+
ZeroOrOne => true,
186+
ZeroOrMore => true,
187+
OneOrMore => false,
188+
Range { min, .. } => min == 0,
189+
}
190+
}
191+
}
192+
180193
/// A character class.
181194
///
182195
/// A character class has a canonical format that the parser guarantees. Its
@@ -315,7 +328,9 @@ impl Expr {
315328
/// the beginning of text.
316329
pub fn is_anchored_start(&self) -> bool {
317330
match *self {
318-
Repeat { ref e, .. } => e.is_anchored_start(),
331+
Repeat { ref e, r, .. } => {
332+
!r.matches_empty() && e.is_anchored_start()
333+
}
319334
Group { ref e, .. } => e.is_anchored_start(),
320335
Concat(ref es) => es[0].is_anchored_start(),
321336
Alternate(ref es) => es.iter().all(|e| e.is_anchored_start()),
@@ -328,7 +343,9 @@ impl Expr {
328343
/// end of the text.
329344
pub fn is_anchored_end(&self) -> bool {
330345
match *self {
331-
Repeat { ref e, .. } => e.is_anchored_end(),
346+
Repeat { ref e, r, .. } => {
347+
!r.matches_empty() && e.is_anchored_end()
348+
}
332349
Group { ref e, .. } => e.is_anchored_end(),
333350
Concat(ref es) => es[es.len() - 1].is_anchored_end(),
334351
Alternate(ref es) => es.iter().all(|e| e.is_anchored_end()),
@@ -1059,9 +1076,6 @@ mod tests {
10591076
assert!(e("^a|^b").is_anchored_start());
10601077
assert!(e("(^a)|(^b)").is_anchored_start());
10611078
assert!(e("(^(a|b))").is_anchored_start());
1062-
assert!(e("^*").is_anchored_start());
1063-
assert!(e("(^)*").is_anchored_start());
1064-
assert!(e("((^)*)*").is_anchored_start());
10651079

10661080
assert!(!e("^a|b").is_anchored_start());
10671081
assert!(!e("a|^b").is_anchored_start());
@@ -1074,9 +1088,6 @@ mod tests {
10741088
assert!(e("a$|b$").is_anchored_end());
10751089
assert!(e("(a$)|(b$)").is_anchored_end());
10761090
assert!(e("((a|b)$)").is_anchored_end());
1077-
assert!(e("$*").is_anchored_end());
1078-
assert!(e("($)*").is_anchored_end());
1079-
assert!(e("(($)*)*").is_anchored_end());
10801091

10811092
assert!(!e("a$|b").is_anchored_end());
10821093
assert!(!e("a|b$").is_anchored_end());

regex_macros/Cargo.toml

+4
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@ path = ".."
2222
version = "0.1"
2323
features = ["pattern"]
2424

25+
[dependencies.regex-syntax]
26+
path = "../regex-syntax"
27+
version = "0.2"
28+
2529
[dev-dependencies]
2630
lazy_static = "0.1"
2731
rand = "0.3"

regex_macros/src/lib.rs

+31-21
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,12 @@
1818
#![feature(plugin_registrar, quote, rustc_private)]
1919

2020
extern crate regex;
21-
extern crate syntax;
21+
extern crate regex_syntax;
2222
extern crate rustc_plugin;
23+
extern crate syntax;
24+
25+
use std::collections::BTreeMap;
26+
use std::usize;
2327

2428
use syntax::ast;
2529
use syntax::codemap;
@@ -32,7 +36,8 @@ use syntax::ptr::P;
3236

3337
use rustc_plugin::Registry;
3438

35-
use regex::internal::{Inst, EmptyLook, Program, ProgramBuilder};
39+
use regex::internal::{Compiler, EmptyLook, Inst, Program};
40+
use regex_syntax::Expr;
3641

3742
/// For the `regex!` syntax extension. Do not use.
3843
#[plugin_registrar]
@@ -67,15 +72,21 @@ fn native(cx: &mut ExtCtxt, sp: codemap::Span, tts: &[ast::TokenTree])
6772
};
6873
// We use the largest possible size limit because this is happening at
6974
// compile time. We trust the programmer.
70-
let bprog = ProgramBuilder::new(&regex).size_limit(::std::usize::MAX);
71-
let prog = match bprog.compile() {
75+
let expr = match Expr::parse(&regex) {
76+
Ok(expr) => expr,
77+
Err(err) => {
78+
cx.span_err(sp, &err.to_string());
79+
return DummyResult::any(sp)
80+
}
81+
};
82+
let prog = match Compiler::new().size_limit(usize::MAX).compile(&[expr]) {
7283
Ok(re) => re,
7384
Err(err) => {
7485
cx.span_err(sp, &err.to_string());
7586
return DummyResult::any(sp)
7687
}
7788
};
78-
let names = prog.cap_names.iter().cloned().collect();
89+
let names = prog.captures.iter().cloned().collect();
7990
let mut gen = NfaGen {
8091
cx: &*cx,
8192
sp: sp,
@@ -98,8 +109,8 @@ impl<'a> NfaGen<'a> {
98109
fn code(&mut self) -> P<ast::Expr> {
99110
// Most or all of the following things are used in the quasiquoted
100111
// expression returned.
101-
let num_cap_locs = 2 * self.prog.num_captures();
102-
let num_insts = self.prog.insts.len();
112+
let num_cap_locs = 2 * self.prog.captures.len();
113+
let num_insts = self.prog.len();
103114
let cap_names = self.vec_expr(self.names.iter(),
104115
&mut |cx, name| match *name {
105116
Some(ref name) => {
@@ -109,21 +120,20 @@ impl<'a> NfaGen<'a> {
109120
None => cx.expr_none(self.sp),
110121
}
111122
);
112-
let named_groups = {
113-
let mut named_groups = ::std::collections::BTreeMap::new();
123+
let capture_name_idx = {
124+
let mut capture_name_idx = BTreeMap::new();
114125
for (i, name) in self.names.iter().enumerate() {
115126
if let Some(ref name) = *name {
116-
named_groups.insert(name.to_owned(), i);
127+
capture_name_idx.insert(name.to_owned(), i);
117128
}
118129
}
119-
self.vec_expr(named_groups.iter(),
130+
self.vec_expr(capture_name_idx.iter(),
120131
&mut |cx, (name, group_idx)|
121132
quote_expr!(cx, ($name, $group_idx))
122133
)
123134
};
124135

125-
let prefix_anchor = self.prog.anchored_begin;
126-
136+
let is_anchored_start = self.prog.is_anchored_start;
127137
let step_insts = self.step_insts();
128138
let add_insts = self.add_insts();
129139
let regex = &*self.original;
@@ -135,9 +145,9 @@ impl<'a> NfaGen<'a> {
135145
// the user is only warned about *their* unused variable/code, and not the
136146
// unused code generated by regex!. See #14185 for an example.
137147
#[allow(dead_code)]
138-
static CAP_NAMES: &'static [Option<&'static str>] = &$cap_names;
148+
static CAPTURES: &'static [Option<&'static str>] = &$cap_names;
139149
#[allow(dead_code)]
140-
static NAMED_GROUPS: &'static [(&'static str, usize)] = &$named_groups;
150+
static CAPTURE_NAME_IDX: &'static [(&'static str, usize)] = &$capture_name_idx;
141151

142152
#[allow(dead_code)]
143153
fn exec<'t>(
@@ -175,14 +185,14 @@ fn exec<'t>(
175185
clist.empty(); nlist.empty();
176186
'LOOP: loop {
177187
if clist.size == 0 {
178-
if matched || (!at.is_beginning() && $prefix_anchor) {
188+
if matched || (!at.is_start() && $is_anchored_start) {
179189
break;
180190
}
181191
// TODO: Prefix matching... Hmm.
182192
// Prefix matching now uses a DFA, so I think this is
183193
// going to require encoding that DFA statically.
184194
}
185-
if clist.size == 0 || (!$prefix_anchor && !matched) {
195+
if clist.size == 0 || (!$is_anchored_start && !matched) {
186196
self.add(clist, &mut caps, 0, at);
187197
}
188198
let at_next = self.input.at(at.next_pos());
@@ -322,8 +332,8 @@ fn exec<'t>(
322332

323333
::regex::Regex::Native(::regex::internal::ExNative {
324334
original: $regex,
325-
names: &CAP_NAMES,
326-
groups: &NAMED_GROUPS,
335+
names: &CAPTURES,
336+
groups: &CAPTURE_NAME_IDX,
327337
prog: exec,
328338
})
329339
})
@@ -332,7 +342,7 @@ fn exec<'t>(
332342
// Generates code for the `add` method, which is responsible for adding
333343
// zero-width states to the next queue of states to visit.
334344
fn add_insts(&self) -> P<ast::Expr> {
335-
let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| {
345+
let arms = self.prog.iter().enumerate().map(|(pc, inst)| {
336346
let body = match *inst {
337347
Inst::EmptyLook(ref inst) => {
338348
let nextpc = inst.goto;
@@ -422,7 +432,7 @@ fn exec<'t>(
422432
// Generates the code for the `step` method, which processes all states
423433
// in the current queue that consume a single character.
424434
fn step_insts(&self) -> P<ast::Expr> {
425-
let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| {
435+
let arms = self.prog.iter().enumerate().map(|(pc, inst)| {
426436
let body = match *inst {
427437
Inst::Match => quote_expr!(self.cx, {
428438
for (slot, val) in caps.iter_mut().zip(thread_caps.iter()) {

0 commit comments

Comments
 (0)