From dc9378b0b5e18c5c516860a1070a7682964f1f6f Mon Sep 17 00:00:00 2001 From: Valentin Gatien-Baron Date: Mon, 11 May 2020 01:25:04 -0400 Subject: [PATCH 1/4] tests: add test that some counted repetition are rejected --- tests/noparse.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/noparse.rs b/tests/noparse.rs index 62eb5be77e..ec53afb804 100644 --- a/tests/noparse.rs +++ b/tests/noparse.rs @@ -26,6 +26,8 @@ noparse!(fail_bad_capture_name, "(?P)"); noparse!(fail_bad_flag, "(?a)a"); noparse!(fail_too_big, "a{10000000}"); noparse!(fail_counted_no_close, "a{1001"); +noparse!(fail_counted_decreasing, "a{2,1}"); +noparse!(fail_counted_nonnegative, "a{-1,1}"); noparse!(fail_unfinished_cap, "(?"); noparse!(fail_unfinished_escape, "\\"); noparse!(fail_octal_digit, r"\8"); From 89f4a3a19d340c58f61c748a4b1ae352970c83a7 Mon Sep 17 00:00:00 2001 From: Valentin Gatien-Baron Date: Mon, 11 May 2020 01:16:24 -0400 Subject: [PATCH 2/4] hir: make is_alternation_literal say false on Empty To avoid this assertion in tests when empty alternations are allowed: internal error: entered unreachable code: expected literal or concat, got Hir { kind: Empty, info: HirInfo { bools: 1795 } }', src/exec.rs:1568:18 The code in exec.rs relies on the documented invariant for is_alternation_literal: /// ... This is only /// true when this HIR expression is either itself a `Literal` or a /// concatenation of only `Literal`s or an alternation of only `Literal`s. --- regex-syntax/src/hir/mod.rs | 12 ++++++------ regex-syntax/src/hir/translate.rs | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index d2f3902dd9..53d90b8425 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -241,8 +241,8 @@ impl Hir { info.set_any_anchored_start(false); info.set_any_anchored_end(false); info.set_match_empty(true); - info.set_literal(true); - info.set_alternation_literal(true); + info.set_literal(false); + info.set_alternation_literal(false); Hir { kind: HirKind::Empty, info: info } } @@ -671,8 +671,8 @@ impl Hir { /// true when this HIR expression is either itself a `Literal` or a /// concatenation of only `Literal`s. /// - /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()` - /// are not (even though that contain sub-expressions that are literals). + /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()`, + /// `` are not (even though that contain sub-expressions that are literals). pub fn is_literal(&self) -> bool { self.info.is_literal() } @@ -682,8 +682,8 @@ impl Hir { /// true when this HIR expression is either itself a `Literal` or a /// concatenation of only `Literal`s or an alternation of only `Literal`s. /// - /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternaiton - /// literals, but `f+`, `(foo)`, `foo()` + /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation + /// literals, but `f+`, `(foo)`, `foo()`, `` /// are not (even though that contain sub-expressions that are literals). pub fn is_alternation_literal(&self) -> bool { self.info.is_alternation_literal() diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 2469890684..72f3f9aed6 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -3105,13 +3105,13 @@ mod tests { #[test] fn analysis_is_literal() { // Positive examples. - assert!(t(r"").is_literal()); assert!(t(r"a").is_literal()); assert!(t(r"ab").is_literal()); assert!(t(r"abc").is_literal()); assert!(t(r"(?m)abc").is_literal()); // Negative examples. + assert!(!t(r"").is_literal()); assert!(!t(r"^").is_literal()); assert!(!t(r"a|b").is_literal()); assert!(!t(r"(a)").is_literal()); @@ -3124,7 +3124,6 @@ mod tests { #[test] fn analysis_is_alternation_literal() { // Positive examples. - assert!(t(r"").is_alternation_literal()); assert!(t(r"a").is_alternation_literal()); assert!(t(r"ab").is_alternation_literal()); assert!(t(r"abc").is_alternation_literal()); @@ -3135,6 +3134,7 @@ mod tests { assert!(t(r"foo|bar|baz").is_alternation_literal()); // Negative examples. + assert!(!t(r"").is_alternation_literal()); assert!(!t(r"^").is_alternation_literal()); assert!(!t(r"(a)").is_alternation_literal()); assert!(!t(r"a+").is_alternation_literal()); From bd63e7fa694cdcdd3106da1ee28bd7220a2b6dd1 Mon Sep 17 00:00:00 2001 From: Valentin Gatien-Baron Date: Mon, 11 May 2020 01:25:20 -0400 Subject: [PATCH 3/4] tests: add tests involving the Empty regex --- tests/crazy.rs | 12 ++++++++++++ tests/noparse.rs | 7 ------- tests/set.rs | 11 +++++++++++ 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/tests/crazy.rs b/tests/crazy.rs index 8c72273d93..56f6cadb90 100644 --- a/tests/crazy.rs +++ b/tests/crazy.rs @@ -118,6 +118,18 @@ matiter!(match_empty8, r"()+|z", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); matiter!(match_empty9, r"z|()+", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); matiter!(match_empty10, r"()+|b", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); matiter!(match_empty11, r"b|()+", "abc", (0, 0), (1, 2), (3, 3)); +matiter!(match_empty12, r"|b", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty13, r"b|", "abc", (0, 0), (1, 2), (3, 3)); +matiter!(match_empty14, r"|z", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty15, r"z|", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty16, r"|", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty17, r"||", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty18, r"||z", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty19, r"(?:)|b", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty20, r"b|(?:)", "abc", (0, 0), (1, 2), (3, 3)); +matiter!(match_empty21, r"(?:|)", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty22, r"(?:|)|z", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty23, r"a(?:)|b", "abc", (0, 1), (1, 2)); // Test that the DFA can handle pathological cases. // (This should result in the DFA's cache being flushed too frequently, which diff --git a/tests/noparse.rs b/tests/noparse.rs index ec53afb804..8ded1dce7b 100644 --- a/tests/noparse.rs +++ b/tests/noparse.rs @@ -43,10 +43,3 @@ noparse!(fail_range_end_no_class, "[a-[:lower:]]"); noparse!(fail_range_end_no_begin, r"[a-\A]"); noparse!(fail_range_end_no_end, r"[a-\z]"); noparse!(fail_range_end_no_boundary, r"[a-\b]"); -noparse!(fail_empty_alt1, r"|z"); -noparse!(fail_empty_alt2, r"z|"); -noparse!(fail_empty_alt3, r"|"); -noparse!(fail_empty_alt4, r"||"); -noparse!(fail_empty_alt5, r"()|z"); -noparse!(fail_empty_alt6, r"z|()"); -noparse!(fail_empty_alt7, r"(|)"); diff --git a/tests/set.rs b/tests/set.rs index 3e9755cc26..648feec4be 100644 --- a/tests/set.rs +++ b/tests/set.rs @@ -17,6 +17,17 @@ matset!(set16, &["a"], "a", 0); matset!(set17, &[".*a"], "a", 0); matset!(set18, &["a", "β"], "β", 1); +// regexes that match the empty string +matset!(setempty1, &["", "a"], "abc", 0, 1); +matset!(setempty2, &["", "b"], "abc", 0, 1); +matset!(setempty3, &["", "z"], "abc", 0); +matset!(setempty4, &["a", ""], "abc", 0, 1); +matset!(setempty5, &["b", ""], "abc", 0, 1); +matset!(setempty6, &["z", ""], "abc", 1); +matset!(setempty7, &["b", "(?:)"], "abc", 0, 1); +matset!(setempty8, &["(?:)", "b"], "abc", 0, 1); +matset!(setempty9, &["c(?:)", "b"], "abc", 0, 1); + nomatset!(nset1, &["a", "a"], "b"); nomatset!(nset2, &["^foo", "bar$"], "bar foo"); nomatset!( From 1c50848c600e2fbda76eba811a1c1636bcd7eb9f Mon Sep 17 00:00:00 2001 From: Valentin Gatien-Baron Date: Mon, 11 May 2020 01:23:47 -0400 Subject: [PATCH 4/4] compile: support empty patterns better Some were rejected explicitly with this "alternations cannot currently contain empty sub-expressions" error. Some were causing alternations to be compiled into automaton matching a different regex. The root of the issue is that the compilation of the Empty pattern returns Patch { entry: self.insts.len(), .. }, that is it claims its entry point is the next instruction. This only works if the next instruction is unconditionally executed, which is not true for alternations. So we return None instead, and let the callers decide how to handle an empty regex. --- src/compile.rs | 250 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 166 insertions(+), 84 deletions(-) diff --git a/src/compile.rs b/src/compile.rs index 24184008c5..ad54040a76 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -15,6 +15,7 @@ use prog::{ use Error; type Result = result::Result; +type ResultOrEmpty = result::Result, Error>; #[derive(Debug)] struct Patch { @@ -132,7 +133,7 @@ impl Compiler { self.compiled.start = dotstar_patch.entry; } self.compiled.captures = vec![None]; - let patch = self.c_capture(0, expr)?; + let patch = self.c_capture(0, expr)?.unwrap_or(self.next_inst()); if self.compiled.needs_dotstar() { self.fill(dotstar_patch.hole, patch.entry); } else { @@ -167,14 +168,16 @@ impl Compiler { for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() { self.fill_to_next(prev_hole); let split = self.push_split_hole(); - let Patch { hole, entry } = self.c_capture(0, expr)?; + let Patch { hole, entry } = + self.c_capture(0, expr)?.unwrap_or(self.next_inst()); self.fill_to_next(hole); self.compiled.matches.push(self.insts.len()); self.push_compiled(Inst::Match(i)); prev_hole = self.fill_split(split, Some(entry), None); } let i = exprs.len() - 1; - let Patch { hole, entry } = self.c_capture(0, &exprs[i])?; + let Patch { hole, entry } = + self.c_capture(0, &exprs[i])?.unwrap_or(self.next_inst()); self.fill(prev_hole, entry); self.fill_to_next(hole); self.compiled.matches.push(self.insts.len()); @@ -242,13 +245,16 @@ impl Compiler { /// method you will see that it does exactly this, though it handles /// a list of expressions rather than just the two that we use for /// an example. - fn c(&mut self, expr: &Hir) -> Result { + /// + /// Ok(None) is returned when an expression is compiled to no + /// instruction, and so no patch.entry value makes sense. + fn c(&mut self, expr: &Hir) -> ResultOrEmpty { use prog; use syntax::hir::HirKind::*; self.check_size()?; match *expr.kind() { - Empty => Ok(Patch { hole: Hole::None, entry: self.insts.len() }), + Empty => Ok(None), Literal(hir::Literal::Unicode(c)) => self.c_char(c), Literal(hir::Literal::Byte(b)) => { assert!(self.compiled.uses_bytes()); @@ -357,7 +363,7 @@ impl Compiler { } } - fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> Result { + fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty { if self.num_exprs > 1 || self.compiled.is_dfa { // Don't ever compile Save instructions for regex sets because // they are never used. They are also never used in DFA programs @@ -366,11 +372,11 @@ impl Compiler { } else { let entry = self.insts.len(); let hole = self.push_hole(InstHole::Save { slot: first_slot }); - let patch = self.c(expr)?; + let patch = self.c(expr)?.unwrap_or(self.next_inst()); self.fill(hole, patch.entry); self.fill_to_next(patch.hole); let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 }); - Ok(Patch { hole: hole, entry: entry }) + Ok(Some(Patch { hole: hole, entry: entry })) } } @@ -381,36 +387,38 @@ impl Compiler { greedy: false, hir: Box::new(Hir::any(true)), }))? + .unwrap() } else { self.c(&Hir::repetition(hir::Repetition { kind: hir::RepetitionKind::ZeroOrMore, greedy: false, hir: Box::new(Hir::any(false)), }))? + .unwrap() }) } - fn c_char(&mut self, c: char) -> Result { + fn c_char(&mut self, c: char) -> ResultOrEmpty { if self.compiled.uses_bytes() { if c.is_ascii() { let b = c as u8; let hole = self.push_hole(InstHole::Bytes { start: b, end: b }); self.byte_classes.set_range(b, b); - Ok(Patch { hole, entry: self.insts.len() - 1 }) + Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) } else { self.c_class(&[hir::ClassUnicodeRange::new(c, c)]) } } else { let hole = self.push_hole(InstHole::Char { c: c }); - Ok(Patch { hole, entry: self.insts.len() - 1 }) + Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) } } - fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> Result { + fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty { assert!(!ranges.is_empty()); if self.compiled.uses_bytes() { - CompileClass { c: self, ranges: ranges }.compile() + Ok(Some(CompileClass { c: self, ranges: ranges }.compile()?)) } else { let ranges: Vec<(char, char)> = ranges.iter().map(|r| (r.start(), r.end())).collect(); @@ -419,15 +427,18 @@ impl Compiler { } else { self.push_hole(InstHole::Ranges { ranges: ranges }) }; - Ok(Patch { hole: hole, entry: self.insts.len() - 1 }) + Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 })) } } - fn c_byte(&mut self, b: u8) -> Result { + fn c_byte(&mut self, b: u8) -> ResultOrEmpty { self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)]) } - fn c_class_bytes(&mut self, ranges: &[hir::ClassBytesRange]) -> Result { + fn c_class_bytes( + &mut self, + ranges: &[hir::ClassBytesRange], + ) -> ResultOrEmpty { debug_assert!(!ranges.is_empty()); let first_split_entry = self.insts.len(); @@ -451,35 +462,39 @@ impl Compiler { self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }), ); self.fill(prev_hole, next); - Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry }) + Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) } - fn c_empty_look(&mut self, look: EmptyLook) -> Result { + fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty { let hole = self.push_hole(InstHole::EmptyLook { look: look }); - Ok(Patch { hole: hole, entry: self.insts.len() - 1 }) + Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 })) } - fn c_concat<'a, I>(&mut self, exprs: I) -> Result + fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty where I: IntoIterator, { let mut exprs = exprs.into_iter(); - let first = match exprs.next() { - Some(expr) => expr, - None => { - return Ok(Patch { hole: Hole::None, entry: self.insts.len() }) + let Patch { mut hole, entry } = loop { + match exprs.next() { + None => return Ok(None), + Some(e) => { + if let Some(p) = self.c(e)? { + break p; + } + } } }; - let Patch { mut hole, entry } = self.c(first)?; for e in exprs { - let p = self.c(e)?; - self.fill(hole, p.entry); - hole = p.hole; + if let Some(p) = self.c(e)? { + self.fill(hole, p.entry); + hole = p.hole; + } } - Ok(Patch { hole: hole, entry: entry }) + Ok(Some(Patch { hole: hole, entry: entry })) } - fn c_alternate(&mut self, exprs: &[Hir]) -> Result { + fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty { debug_assert!( exprs.len() >= 2, "alternates must have at least 2 exprs" @@ -492,43 +507,43 @@ impl Compiler { // patched to point to the same location. let mut holes = vec![]; - let mut prev_hole = Hole::None; + // true indicates that the hole is a split where we want to fill + // the second branch. + let mut prev_hole = (Hole::None, false); for e in &exprs[0..exprs.len() - 1] { - self.fill_to_next(prev_hole); + if prev_hole.1 { + let next = self.insts.len(); + self.fill_split(prev_hole.0, None, Some(next)); + } else { + self.fill_to_next(prev_hole.0); + } let split = self.push_split_hole(); - let prev_entry = self.insts.len(); - let Patch { hole, entry } = self.c(e)?; - if prev_entry == self.insts.len() { - // TODO(burntsushi): It is kind of silly that we don't support - // empty-subexpressions in alternates, but it is supremely - // awkward to support them in the existing compiler - // infrastructure. This entire compiler needs to be thrown out - // anyway, so don't feel too bad. - return Err(Error::Syntax( - "alternations cannot currently contain \ - empty sub-expressions" - .to_string(), - )); + if let Some(Patch { hole, entry }) = self.c(e)? { + holes.push(hole); + prev_hole = (self.fill_split(split, Some(entry), None), false); + } else { + let (split1, split2) = split.dup_one(); + holes.push(split1); + prev_hole = (split2, true); } - holes.push(hole); - prev_hole = self.fill_split(split, Some(entry), None); } - let prev_entry = self.insts.len(); - let Patch { hole, entry } = self.c(&exprs[exprs.len() - 1])?; - if prev_entry == self.insts.len() { - // TODO(burntsushi): See TODO above. - return Err(Error::Syntax( - "alternations cannot currently contain \ - empty sub-expressions" - .to_string(), - )); + if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? { + holes.push(hole); + if prev_hole.1 { + self.fill_split(prev_hole.0, None, Some(entry)); + } else { + self.fill(prev_hole.0, entry); + } + } else { + // We ignore prev_hole.1. When it's true, it means we have two + // empty branches both pushing prev_hole.0 into holes, so both + // branches will go to the same place anyway. + holes.push(prev_hole.0); } - holes.push(hole); - self.fill(prev_hole, entry); - Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry }) + Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) } - fn c_repeat(&mut self, rep: &hir::Repetition) -> Result { + fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty { use syntax::hir::RepetitionKind::*; match rep.kind { ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy), @@ -546,24 +561,37 @@ impl Compiler { } } - fn c_repeat_zero_or_one(&mut self, expr: &Hir, greedy: bool) -> Result { + fn c_repeat_zero_or_one( + &mut self, + expr: &Hir, + greedy: bool, + ) -> ResultOrEmpty { let split_entry = self.insts.len(); let split = self.push_split_hole(); - let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?; - + let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { + Some(p) => p, + None => return self.pop_split_hole(), + }; let split_hole = if greedy { self.fill_split(split, Some(entry_rep), None) } else { self.fill_split(split, None, Some(entry_rep)) }; let holes = vec![hole_rep, split_hole]; - Ok(Patch { hole: Hole::Many(holes), entry: split_entry }) + Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry })) } - fn c_repeat_zero_or_more(&mut self, expr: &Hir, greedy: bool) -> Result { + fn c_repeat_zero_or_more( + &mut self, + expr: &Hir, + greedy: bool, + ) -> ResultOrEmpty { let split_entry = self.insts.len(); let split = self.push_split_hole(); - let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?; + let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { + Some(p) => p, + None => return self.pop_split_hole(), + }; self.fill(hole_rep, split_entry); let split_hole = if greedy { @@ -571,11 +599,18 @@ impl Compiler { } else { self.fill_split(split, None, Some(entry_rep)) }; - Ok(Patch { hole: split_hole, entry: split_entry }) + Ok(Some(Patch { hole: split_hole, entry: split_entry })) } - fn c_repeat_one_or_more(&mut self, expr: &Hir, greedy: bool) -> Result { - let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?; + fn c_repeat_one_or_more( + &mut self, + expr: &Hir, + greedy: bool, + ) -> ResultOrEmpty { + let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { + Some(p) => p, + None => return Ok(None), + }; self.fill_to_next(hole_rep); let split = self.push_split_hole(); @@ -584,7 +619,7 @@ impl Compiler { } else { self.fill_split(split, None, Some(entry_rep)) }; - Ok(Patch { hole: split_hole, entry: entry_rep }) + Ok(Some(Patch { hole: split_hole, entry: entry_rep })) } fn c_repeat_range_min_or_more( @@ -592,12 +627,20 @@ impl Compiler { expr: &Hir, greedy: bool, min: u32, - ) -> Result { + ) -> ResultOrEmpty { let min = u32_to_usize(min); - let patch_concat = self.c_concat(iter::repeat(expr).take(min))?; - let patch_rep = self.c_repeat_zero_or_more(expr, greedy)?; - self.fill(patch_concat.hole, patch_rep.entry); - Ok(Patch { hole: patch_rep.hole, entry: patch_concat.entry }) + // Using next_inst() is ok, because we can't return it (concat would + // have to return Some(_) while c_repeat_range_min_or_more returns + // None). + let patch_concat = self + .c_concat(iter::repeat(expr).take(min))? + .unwrap_or(self.next_inst()); + if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? { + self.fill(patch_concat.hole, patch_rep.entry); + Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry })) + } else { + Ok(None) + } } fn c_repeat_range( @@ -606,13 +649,17 @@ impl Compiler { greedy: bool, min: u32, max: u32, - ) -> Result { + ) -> ResultOrEmpty { let (min, max) = (u32_to_usize(min), u32_to_usize(max)); + debug_assert!(min <= max); let patch_concat = self.c_concat(iter::repeat(expr).take(min))?; - let initial_entry = patch_concat.entry; if min == max { return Ok(patch_concat); } + // Same reasoning as in c_repeat_range_min_or_more (we know that min < + // max at this point). + let patch_concat = patch_concat.unwrap_or(self.next_inst()); + let initial_entry = patch_concat.entry; // It is much simpler to compile, e.g., `a{2,5}` as: // // aaa?a?a? @@ -637,7 +684,10 @@ impl Compiler { for _ in min..max { self.fill_to_next(prev_hole); let split = self.push_split_hole(); - let Patch { hole, entry } = self.c(expr)?; + let Patch { hole, entry } = match self.c(expr)? { + Some(p) => p, + None => return self.pop_split_hole(), + }; prev_hole = hole; if greedy { holes.push(self.fill_split(split, Some(entry), None)); @@ -646,7 +696,14 @@ impl Compiler { } } holes.push(prev_hole); - Ok(Patch { hole: Hole::Many(holes), entry: initial_entry }) + Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry })) + } + + /// Can be used as a default value for the c_* functions when the call to + /// c_function is followed by inserting at least one instruction that is + /// always executed after the ones written by the c* function. + fn next_inst(&self) -> Patch { + Patch { hole: Hole::None, entry: self.insts.len() } } fn fill(&mut self, hole: Hole, goto: InstPtr) { @@ -726,6 +783,11 @@ impl Compiler { Hole::One(hole) } + fn pop_split_hole(&mut self) -> ResultOrEmpty { + self.insts.pop(); + Ok(None) + } + fn check_size(&self) -> result::Result<(), Error> { use std::mem::size_of; @@ -744,6 +806,17 @@ enum Hole { Many(Vec), } +impl Hole { + fn dup_one(self) -> (Self, Self) { + match self { + Hole::One(pc) => (Hole::One(pc), Hole::One(pc)), + Hole::None | Hole::Many(_) => { + unreachable!("must be called on single hole") + } + } + } +} + #[derive(Clone, Debug)] enum MaybeInst { Compiled(Inst), @@ -755,13 +828,22 @@ enum MaybeInst { impl MaybeInst { fn fill(&mut self, goto: InstPtr) { - let filled = match *self { - MaybeInst::Uncompiled(ref inst) => inst.fill(goto), + let maybeinst = match *self { + MaybeInst::Split => MaybeInst::Split1(goto), + MaybeInst::Uncompiled(ref inst) => { + MaybeInst::Compiled(inst.fill(goto)) + } MaybeInst::Split1(goto1) => { - Inst::Split(InstSplit { goto1: goto1, goto2: goto }) + MaybeInst::Compiled(Inst::Split(InstSplit { + goto1: goto1, + goto2: goto, + })) } MaybeInst::Split2(goto2) => { - Inst::Split(InstSplit { goto1: goto, goto2: goto2 }) + MaybeInst::Compiled(Inst::Split(InstSplit { + goto1: goto, + goto2: goto2, + })) } _ => unreachable!( "not all instructions were compiled! \ @@ -769,7 +851,7 @@ impl MaybeInst { self ), }; - *self = MaybeInst::Compiled(filled); + *self = maybeinst; } fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {