diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 08cc60d9a8..2813a16763 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -141,7 +141,7 @@ jobs: - name: Install Rust uses: dtolnay/rust-toolchain@master with: - toolchain: 1.60.0 + toolchain: 1.65.0 # The memchr 2.6 release purportedly bumped its MSRV to Rust 1.60, but it # turned out that on aarch64, it was using something that wasn't stabilized # until Rust 1.61[1]. (This was an oversight on my part. I had previously diff --git a/CHANGELOG.md b/CHANGELOG.md index a50b811ddf..b51142218f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,81 @@ +1.10.0 (2023-10-09) +=================== +This is a new minor release of `regex` that adds support for start and end +word boundary assertions. That is, `\<` and `\>`. The minimum supported Rust +version has also been raised to 1.65, which was released about one year ago. + +The new word boundary assertions are: + +* `\<` or `\b{start}`: a Unicode start-of-word boundary (`\W|\A` on the left, +`\w` on the right). +* `\>` or `\b{end}`: a Unicode end-of-word boundary (`\w` on the left, `\W|\z` +on the right)). +* `\b{start-half}`: half of a Unicode start-of-word boundary (`\W|\A` on the +left). +* `\b{end-half}`: half of a Unicode end-of-word boundary (`\W|\z` on the +right). + +The `\<` and `\>` are GNU extensions to POSIX regexes. They have been added +to the `regex` crate because they enjoy somewhat broad support in other regex +engines as well (for example, vim). The `\b{start}` and `\b{end}` assertions +are aliases for `\<` and `\>`, respectively. + +The `\b{start-half}` and `\b{end-half}` assertions are not found in any +other regex engine (although regex engines with general look-around support +can certainly express them). They were added principally to support the +implementation of word matching in grep programs, where one generally wants to +be a bit more flexible in what is considered a word boundary. + +New features: + +* [FEATURE #469](https://github.com/rust-lang/regex/issues/469): +Add support for `\<` and `\>` word boundary assertions. +* [FEATURE(regex-automata) #1031](https://github.com/rust-lang/regex/pull/1031): +DFAs now have a `start_state` method that doesn't use an `Input`. + +Performance improvements: + +* [PERF #1051](https://github.com/rust-lang/regex/pull/1051): +Unicode character class operations have been optimized in `regex-syntax`. +* [PERF #1090](https://github.com/rust-lang/regex/issues/1090): +Make patterns containing lots of literal characters use less memory. + +Bug fixes: + +* [BUG #1046](https://github.com/rust-lang/regex/issues/1046): +Fix a bug that could result in incorrect match spans when using a Unicode word +boundary and searching non-ASCII strings. +* [BUG(regex-syntax) #1047](https://github.com/rust-lang/regex/issues/1047): +Fix panics that can occur in `Ast->Hir` translation (not reachable from `regex` +crate). +* [BUG(regex-syntax) #1088](https://github.com/rust-lang/regex/issues/1088): +Remove guarantees in the API that connect the `u` flag with a specific HIR +representation. + +`regex-automata` breaking change release: + +This release includes a `regex-automata 0.4.0` breaking change release, which +was necessary in order to support the new word boundary assertions. For +example, the `Look` enum has new variants and the `LookSet` type now uses `u32` +instead of `u16` to represent a bitset of look-around assertions. These are +overall very minor changes, and most users of `regex-automata` should be able +to move to `0.4` from `0.3` without any changes at all. + +`regex-syntax` breaking change release: + +This release also includes a `regex-syntax 0.8.0` breaking change release, +which, like `regex-automata`, was necessary in order to support the new word +boundary assertions. This release also includes some changes to the `Ast` +type to reduce heap usage in some cases. If you are using the `Ast` type +directly, your code may require some minor modifications. Otherwise, users of +`regex-syntax 0.7` should be able to migrate to `0.8` without any code changes. + +`regex-lite` release: + +The `regex-lite 0.1.1` release contains support for the new word boundary +assertions. There are no breaking changes. + + 1.9.6 (2023-09-30) ================== This is a patch release that fixes a panic that can occur when the default diff --git a/Cargo.toml b/Cargo.toml index 46664f669d..6f94dc4ae3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,7 @@ categories = ["text-processing"] autotests = false exclude = ["/scripts/*", "/.github/*"] edition = "2021" -rust-version = "1.60.0" +rust-version = "1.65" [workspace] members = [ diff --git a/HACKING.md b/HACKING.md deleted file mode 100644 index 34af5b517c..0000000000 --- a/HACKING.md +++ /dev/null @@ -1,341 +0,0 @@ -Your friendly guide to hacking and navigating the regex library. - -This guide assumes familiarity with Rust and Cargo, and at least a perusal of -the user facing documentation for this crate. - -If you're looking for background on the implementation in this library, then -you can do no better than Russ Cox's article series on implementing regular -expressions using finite automata: https://swtch.com/~rsc/regexp/ - - -## Architecture overview - -As you probably already know, this library executes regular expressions using -finite automata. In particular, a design goal is to make searching linear -with respect to both the regular expression and the text being searched. -Meeting that design goal on its own is not so hard and can be done with an -implementation of the Pike VM (similar to Thompson's construction, but supports -capturing groups), as described in: https://swtch.com/~rsc/regexp/regexp2.html ---- This library contains such an implementation in src/pikevm.rs. - -Making it fast is harder. One of the key problems with the Pike VM is that it -can be in more than one state at any point in time, and must shuffle capture -positions between them. The Pike VM also spends a lot of time following the -same epsilon transitions over and over again. We can employ one trick to -speed up the Pike VM: extract one or more literal prefixes from the regular -expression and execute specialized code to quickly find matches of those -prefixes in the search text. The Pike VM can then be avoided for most the -search, and instead only executed when a prefix is found. The code to find -prefixes is in the regex-syntax crate (in this repository). The code to search -for literals is in src/literals.rs. When more than one literal prefix is found, -we fall back to an Aho-Corasick DFA using the aho-corasick crate. For one -literal, we use a variant of the Boyer-Moore algorithm. Both Aho-Corasick and -Boyer-Moore use `memchr` when appropriate. The Boyer-Moore variant in this -library also uses elementary frequency analysis to choose the right byte to run -`memchr` with. - -Of course, detecting prefix literals can only take us so far. Not all regular -expressions have literal prefixes. To remedy this, we try another approach -to executing the Pike VM: backtracking, whose implementation can be found in -src/backtrack.rs. One reason why backtracking can be faster is that it avoids -excessive shuffling of capture groups. Of course, backtracking is susceptible -to exponential runtimes, so we keep track of every state we've visited to make -sure we never visit it again. This guarantees linear time execution, but we -pay for it with the memory required to track visited states. Because of the -memory requirement, we only use this engine on small search strings *and* small -regular expressions. - -Lastly, the real workhorse of this library is the "lazy" DFA in src/dfa.rs. -It is distinct from the Pike VM in that the DFA is explicitly represented in -memory and is only ever in one state at a time. It is said to be "lazy" because -the DFA is computed as text is searched, where each byte in the search text -results in at most one new DFA state. It is made fast by caching states. DFAs -are susceptible to exponential state blow up (where the worst case is computing -a new state for every input byte, regardless of what's in the state cache). To -avoid using a lot of memory, the lazy DFA uses a bounded cache. Once the cache -is full, it is wiped and state computation starts over again. If the cache is -wiped too frequently, then the DFA gives up and searching falls back to one of -the aforementioned algorithms. - -All of the above matching engines expose precisely the same matching semantics. -This is indeed tested. (See the section below about testing.) - -The following sub-sections describe the rest of the library and how each of the -matching engines are actually used. - -### Parsing - -Regular expressions are parsed using the regex-syntax crate, which is -maintained in this repository. The regex-syntax crate defines an abstract -syntax and provides very detailed error messages when a parse error is -encountered. Parsing is done in a separate crate so that others may benefit -from its existence, and because it is relatively divorced from the rest of the -regex library. - -The regex-syntax crate also provides sophisticated support for extracting -prefix and suffix literals from regular expressions. - -### Compilation - -The compiler is in src/compile.rs. The input to the compiler is some abstract -syntax for a regular expression and the output is a sequence of opcodes that -matching engines use to execute a search. (One can think of matching engines as -mini virtual machines.) The sequence of opcodes is a particular encoding of a -non-deterministic finite automaton. In particular, the opcodes explicitly rely -on epsilon transitions. - -Consider a simple regular expression like `a|b`. Its compiled form looks like -this: - - 000 Save(0) - 001 Split(2, 3) - 002 'a' (goto: 4) - 003 'b' - 004 Save(1) - 005 Match - -The first column is the instruction pointer and the second column is the -instruction. Save instructions indicate that the current position in the input -should be stored in a captured location. Split instructions represent a binary -branch in the program (i.e., epsilon transitions). The instructions `'a'` and -`'b'` indicate that the literal bytes `'a'` or `'b'` should match. - -In older versions of this library, the compilation looked like this: - - 000 Save(0) - 001 Split(2, 3) - 002 'a' - 003 Jump(5) - 004 'b' - 005 Save(1) - 006 Match - -In particular, empty instructions that merely served to move execution from one -point in the program to another were removed. Instead, every instruction has a -`goto` pointer embedded into it. This resulted in a small performance boost for -the Pike VM, because it was one fewer epsilon transition that it had to follow. - -There exist more instructions and they are defined and documented in -src/prog.rs. - -Compilation has several knobs and a few unfortunately complicated invariants. -Namely, the output of compilation can be one of two types of programs: a -program that executes on Unicode scalar values or a program that executes -on raw bytes. In the former case, the matching engine is responsible for -performing UTF-8 decoding and executing instructions using Unicode codepoints. -In the latter case, the program handles UTF-8 decoding implicitly, so that the -matching engine can execute on raw bytes. All matching engines can execute -either Unicode or byte based programs except for the lazy DFA, which requires -byte based programs. In general, both representations were kept because (1) the -lazy DFA requires byte based programs so that states can be encoded in a memory -efficient manner and (2) the Pike VM benefits greatly from inlining Unicode -character classes into fewer instructions as it results in fewer epsilon -transitions. - -N.B. UTF-8 decoding is built into the compiled program by making use of the -utf8-ranges crate. The compiler in this library factors out common suffixes to -reduce the size of huge character classes (e.g., `\pL`). - -A regrettable consequence of this split in instruction sets is we generally -need to compile two programs; one for NFA execution and one for the lazy DFA. - -In fact, it is worse than that: the lazy DFA is not capable of finding the -starting location of a match in a single scan, and must instead execute a -backwards search after finding the end location. To execute a backwards search, -we must have compiled the regular expression *in reverse*. - -This means that every compilation of a regular expression generally results in -three distinct programs. It would be possible to lazily compile the Unicode -program, since it is never needed if (1) the regular expression uses no word -boundary assertions and (2) the caller never asks for sub-capture locations. - -### Execution - -At the time of writing, there are four matching engines in this library: - -1. The Pike VM (supports captures). -2. Bounded backtracking (supports captures). -3. Literal substring or multi-substring search. -4. Lazy DFA (no support for Unicode word boundary assertions). - -Only the first two matching engines are capable of executing every regular -expression program. They also happen to be the slowest, which means we need -some logic that (1) knows various facts about the regular expression and (2) -knows what the caller wants. Using this information, we can determine which -engine (or engines) to use. - -The logic for choosing which engine to execute is in src/exec.rs and is -documented on the Exec type. Exec values contain regular expression Programs -(defined in src/prog.rs), which contain all the necessary tidbits for actually -executing a regular expression on search text. - -For the most part, the execution logic is straight-forward and follows the -limitations of each engine described above pretty faithfully. The hairiest -part of src/exec.rs by far is the execution of the lazy DFA, since it requires -a forwards and backwards search, and then falls back to either the Pike VM or -backtracking if the caller requested capture locations. - -The Exec type also contains mutable scratch space for each type of matching -engine. This scratch space is used during search (for example, for the lazy -DFA, it contains compiled states that are reused on subsequent searches). - -### Programs - -A regular expression program is essentially a sequence of opcodes produced by -the compiler plus various facts about the regular expression (such as whether -it is anchored, its capture names, etc.). - -### The regex! macro - -The `regex!` macro no longer exists. It was developed in a bygone era as a -compiler plugin during the infancy of the regex crate. Back then, then only -matching engine in the crate was the Pike VM. The `regex!` macro was, itself, -also a Pike VM. The only advantages it offered over the dynamic Pike VM that -was built at runtime were the following: - - 1. Syntax checking was done at compile time. Your Rust program wouldn't - compile if your regex didn't compile. - 2. Reduction of overhead that was proportional to the size of the regex. - For the most part, this overhead consisted of heap allocation, which - was nearly eliminated in the compiler plugin. - -The main takeaway here is that the compiler plugin was a marginally faster -version of a slow regex engine. As the regex crate evolved, it grew other regex -engines (DFA, bounded backtracker) and sophisticated literal optimizations. -The regex macro didn't keep pace, and it therefore became (dramatically) slower -than the dynamic engines. The only reason left to use it was for the compile -time guarantee that your regex is correct. Fortunately, Clippy (the Rust lint -tool) has a lint that checks your regular expression validity, which mostly -replaces that use case. - -Additionally, the regex compiler plugin stopped receiving maintenance. Nobody -complained. At that point, it seemed prudent to just remove it. - -Will a compiler plugin be brought back? The future is murky, but there is -definitely an opportunity there to build something that is faster than the -dynamic engines in some cases. But it will be challenging! As of now, there -are no plans to work on this. - - -## Testing - -A key aspect of any mature regex library is its test suite. A subset of the -tests in this library come from Glenn Fowler's AT&T test suite (its online -presence seems gone at the time of writing). The source of the test suite is -located in src/testdata. The scripts/regex-match-tests.py takes the test suite -in src/testdata and generates tests/matches.rs. - -There are also many other manually crafted tests and regression tests in -tests/tests.rs. Some of these tests were taken from RE2. - -The biggest source of complexity in the tests is related to answering this -question: how can we reuse the tests to check all of our matching engines? One -approach would have been to encode every test into some kind of format (like -the AT&T test suite) and code generate tests for each matching engine. The -approach we use in this library is to create a Cargo.toml entry point for each -matching engine we want to test. The entry points are: - -* `tests/test_default.rs` - tests `Regex::new` -* `tests/test_default_bytes.rs` - tests `bytes::Regex::new` -* `tests/test_nfa.rs` - tests `Regex::new`, forced to use the NFA - algorithm on every regex. -* `tests/test_nfa_bytes.rs` - tests `Regex::new`, forced to use the NFA - algorithm on every regex and use *arbitrary* byte based programs. -* `tests/test_nfa_utf8bytes.rs` - tests `Regex::new`, forced to use the NFA - algorithm on every regex and use *UTF-8* byte based programs. -* `tests/test_backtrack.rs` - tests `Regex::new`, forced to use - backtracking on every regex. -* `tests/test_backtrack_bytes.rs` - tests `Regex::new`, forced to use - backtracking on every regex and use *arbitrary* byte based programs. -* `tests/test_backtrack_utf8bytes.rs` - tests `Regex::new`, forced to use - backtracking on every regex and use *UTF-8* byte based programs. -* `tests/test_crates_regex.rs` - tests to make sure that all of the - backends behave in the same way against a number of quickcheck - generated random inputs. These tests need to be enabled through - the `RUST_REGEX_RANDOM_TEST` environment variable (see - below). - -The lazy DFA and pure literal engines are absent from this list because -they cannot be used on every regular expression. Instead, we rely on -`tests/test_dynamic.rs` to test the lazy DFA and literal engines when possible. - -Since the tests are repeated several times, and because `cargo test` runs all -entry points, it can take a while to compile everything. To reduce compile -times slightly, try using `cargo test --test default`, which will only use the -`tests/test_default.rs` entry point. - -The random testing takes quite a while, so it is not enabled by default. -In order to run the random testing you can set the -`RUST_REGEX_RANDOM_TEST` environment variable to anything before -invoking `cargo test`. Note that this variable is inspected at compile -time, so if the tests don't seem to be running, you may need to run -`cargo clean`. - -## Benchmarking - -The benchmarking in this crate is made up of many micro-benchmarks. Currently, -there are two primary sets of benchmarks: the benchmarks that were adopted -at this library's inception (in `bench/src/misc.rs`) and a newer set of -benchmarks meant to test various optimizations. Specifically, the latter set -contain some analysis and are in `bench/src/sherlock.rs`. Also, the latter -set are all executed on the same lengthy input whereas the former benchmarks -are executed on strings of varying length. - -There is also a smattering of benchmarks for parsing and compilation. - -Benchmarks are in a separate crate so that its dependencies can be managed -separately from the main regex crate. - -Benchmarking follows a similarly wonky setup as tests. There are multiple entry -points: - -* `bench_rust.rs` - benchmarks `Regex::new` -* `bench_rust_bytes.rs` benchmarks `bytes::Regex::new` -* `bench_pcre.rs` - benchmarks PCRE -* `bench_onig.rs` - benchmarks Oniguruma - -The PCRE and Oniguruma benchmarks exist as a comparison point to a mature -regular expression library. In general, this regex library compares favorably -(there are even a few benchmarks that PCRE simply runs too slowly on or -outright can't execute at all). I would love to add other regular expression -library benchmarks (especially RE2). - -If you're hacking on one of the matching engines and just want to see -benchmarks, then all you need to run is: - - $ (cd bench && ./run rust) - -If you want to compare your results with older benchmarks, then try: - - $ (cd bench && ./run rust | tee old) - $ ... make it faster - $ (cd bench && ./run rust | tee new) - $ cargo benchcmp old new --improvements - -The `cargo-benchcmp` utility is available here: -https://github.com/BurntSushi/cargo-benchcmp - -The `./bench/run` utility can run benchmarks for PCRE and Oniguruma too. See -`./bench/bench --help`. - -## Dev Docs - -When digging your teeth into the codebase for the first time, the -crate documentation can be a great resource. By default `rustdoc` -will strip out all documentation of private crate members in an -effort to help consumers of the crate focus on the *interface* -without having to concern themselves with the *implementation*. -Normally this is a great thing, but if you want to start hacking -on regex internals it is not what you want. Many of the private members -of this crate are well documented with rustdoc style comments, and -it would be a shame to miss out on the opportunity that presents. -You can generate the private docs with: - -``` -$ rustdoc --crate-name docs src/lib.rs -o target/doc -L target/debug/deps --no-defaults --passes collapse-docs --passes unindent-comments -``` - -Then just point your browser at `target/doc/regex/index.html`. - -See https://github.com/rust-lang/rust/issues/15347 for more info -about generating developer docs for internal use. diff --git a/README.md b/README.md index 7454c166d5..f1e4c404ad 100644 --- a/README.md +++ b/README.md @@ -290,9 +290,24 @@ $ rebar cmp results.csv See the `rebar` documentation for more details on how it works and how to compare results with other regex engines. + +### Hacking + +The `regex` crate is, for the most part, a pretty thin wrapper around the +[`meta::Regex`](https://docs.rs/regex-automata/latest/regex_automata/meta/struct.Regex.html) +from the +[`regex-automata` crate](https://docs.rs/regex-automata/latest/regex_automata/). +Therefore, if you're looking to work on the internals of this crate, you'll +likely either want to look in `regex-syntax` (for parsing) or `regex-automata` +(for construction of finite automata and the search routines). + +My [blog on regex internals](https://blog.burntsushi.net/regex-internals/) +goes into more depth. + + ### Minimum Rust version policy -This crate's minimum supported `rustc` version is `1.60.0`. +This crate's minimum supported `rustc` version is `1.65.0`. The policy is that the minimum Rust version required to use this crate can be increased in minor version updates. For example, if regex 1.0 requires Rust diff --git a/fuzz/ast-fuzzers.options b/fuzz/ast-fuzzers.options new file mode 100644 index 0000000000..678d526b1e --- /dev/null +++ b/fuzz/ast-fuzzers.options @@ -0,0 +1,2 @@ +[libfuzzer] +max_len = 65536 diff --git a/fuzz/fuzz_targets/ast_roundtrip.rs b/fuzz/fuzz_targets/ast_roundtrip.rs index 040b59d631..c35ac962e0 100644 --- a/fuzz/fuzz_targets/ast_roundtrip.rs +++ b/fuzz/fuzz_targets/ast_roundtrip.rs @@ -3,7 +3,7 @@ use { libfuzzer_sys::{fuzz_target, Corpus}, regex_syntax::ast::{ - parse::Parser, visit, Ast, Flag, Group, GroupKind, SetFlags, Visitor, + parse::Parser, visit, Ast, Flag, Flags, GroupKind, Visitor, }, }; @@ -32,16 +32,17 @@ impl Visitor for VerboseVisitor { } fn visit_pre(&mut self, ast: &Ast) -> Result { + let reject_flags = |flags: &Flags| { + flags.flag_state(Flag::IgnoreWhitespace).unwrap_or(false) + }; match ast { - Ast::Flags(SetFlags { flags, .. }) - | Ast::Group(Group { - kind: GroupKind::NonCapturing(flags), .. - }) if flags - .flag_state(Flag::IgnoreWhitespace) - .unwrap_or(false) => - { - Err(()) - } + Ast::Flags(x) if reject_flags(&x.flags) => return Err(()), + Ast::Group(x) => match x.kind { + GroupKind::NonCapturing(ref flags) if reject_flags(flags) => { + return Err(()) + } + _ => Ok(()), + }, _ => Ok(()), } } diff --git a/fuzz/fuzz_targets/fuzz_regex_lite_match.rs b/fuzz/fuzz_targets/fuzz_regex_lite_match.rs index 579078c71e..155fa6d8dc 100644 --- a/fuzz/fuzz_targets/fuzz_regex_lite_match.rs +++ b/fuzz/fuzz_targets/fuzz_regex_lite_match.rs @@ -57,8 +57,11 @@ fuzz_target!(|case: FuzzCase| -> Corpus { .dot_matches_new_line(case.dot_matches_new_line) .swap_greed(case.swap_greed) .ignore_whitespace(case.ignore_whitespace) - .size_limit(1<<20) - .build() else { return Corpus::Reject }; + .size_limit(1 << 16) + .build() + else { + return Corpus::Reject; + }; re.is_match(case.haystack); Corpus::Keep }); diff --git a/fuzz/fuzz_targets/fuzz_regex_match.rs b/fuzz/fuzz_targets/fuzz_regex_match.rs index 6c375510d0..a5dda53d65 100644 --- a/fuzz/fuzz_targets/fuzz_regex_match.rs +++ b/fuzz/fuzz_targets/fuzz_regex_match.rs @@ -54,6 +54,9 @@ re.is_match({haystack:?}); fuzz_target!(|case: FuzzCase| -> Corpus { let _ = env_logger::try_init(); + if case.pattern.len() > (16 * (1 << 10)) { + return Corpus::Reject; + } if case.haystack.len() > (16 * (1 << 10)) { return Corpus::Reject; } @@ -65,8 +68,11 @@ fuzz_target!(|case: FuzzCase| -> Corpus { .ignore_whitespace(case.ignore_whitespace) .unicode(case.unicode) .octal(case.octal) - .size_limit(1<<18) - .build() else { return Corpus::Reject }; + .size_limit(1 << 18) + .build() + else { + return Corpus::Reject; + }; re.is_match(case.haystack); Corpus::Keep }); diff --git a/fuzz/oss-fuzz-build.sh b/fuzz/oss-fuzz-build.sh index f96474739c..81f619dcb5 100755 --- a/fuzz/oss-fuzz-build.sh +++ b/fuzz/oss-fuzz-build.sh @@ -14,5 +14,8 @@ targets=( ast_fuzz_match_bytes ) for target in "${targets[@]}"; do - cp fuzz/target/x86_64-unknown-linux-gnu/release/$target $OUT/ + cp "fuzz/target/x86_64-unknown-linux-gnu/release/${target}" "${OUT}/" + if [[ "$target" == ast_* ]]; then + cp fuzz/ast-fuzzers.options "${OUT}/${target}.options" + fi done diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-5990349284442112 b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-5990349284442112 new file mode 100644 index 0000000000..8de974975d Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-5990349284442112 differ diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-6114393576046592 b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-6114393576046592 new file mode 100644 index 0000000000..a34eeaf2c0 Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-6114393576046592 differ diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match_bytes-4820641084473344 b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match_bytes-4820641084473344 new file mode 100644 index 0000000000..ce5b868b95 Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match_bytes-4820641084473344 differ diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-ast_roundtrip-5633607856947200 b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_roundtrip-5633607856947200 new file mode 100644 index 0000000000..726609cf21 Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_roundtrip-5633607856947200 differ diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5883983265923072 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5883983265923072 new file mode 100644 index 0000000000..233fcbc950 Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5883983265923072 differ diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-6363062083649536 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-6363062083649536 new file mode 100644 index 0000000000..d4a35d1d10 Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-6363062083649536 differ diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-4903112680538112 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-4903112680538112 new file mode 100644 index 0000000000..3056bca2f3 Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-4903112680538112 differ diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-5415338693754880 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-5415338693754880 new file mode 100644 index 0000000000..cac835c53e Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-5415338693754880 differ diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5690981331369984 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5690981331369984 new file mode 100644 index 0000000000..d892bc31c4 Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5690981331369984 differ diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5888324890656768 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5888324890656768 new file mode 100644 index 0000000000..8612658526 Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5888324890656768 differ diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-6659953212129280 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-6659953212129280 new file mode 100644 index 0000000000..b8cdc138a4 Binary files /dev/null and b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-6659953212129280 differ diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 7d47140b06..2d08cec75c 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -11,6 +11,7 @@ license = "MIT OR Apache-2.0" categories = ["text-processing"] edition = "2021" autoexamples = false +rust-version = "1.65" [lib] bench = false diff --git a/regex-automata/src/dfa/accel.rs b/regex-automata/src/dfa/accel.rs index 5ea2423dd0..c0ba18ea89 100644 --- a/regex-automata/src/dfa/accel.rs +++ b/regex-automata/src/dfa/accel.rs @@ -6,15 +6,16 @@ // non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its // DFA with regex-cli: // -// $ regex-cli debug dfa dense '(?-u)[^a]+a' -BbC -// dense::DFA( +// $ regex-cli debug dense dfa -p '(?-u)[^a]+a' -BbC --no-table // D 000000: // Q 000001: // *000002: -// A 000003: \x00-` => 3, a => 5, b-\xFF => 3 -// >000004: \x00-` => 3, a => 4, b-\xFF => 3 -// 000005: \x00-\xFF => 2, EOI => 2 -// ) +// A 000003: \x00-` => 3, a => 8, b-\xFF => 3 +// A 000004: \x00-` => 4, a => 7, b-\xFF => 4 +// 000005: \x00-` => 4, b-\xFF => 4 +// 000006: \x00-` => 3, a => 6, b-\xFF => 3 +// 000007: \x00-\xFF => 2, EOI => 2 +// 000008: \x00-\xFF => 2, EOI => 2 // // In particular, state 3 is accelerated (shown via the 'A' indicator) since // the only way to leave that state once entered is to see an 'a' byte. If diff --git a/regex-automata/src/dfa/automaton.rs b/regex-automata/src/dfa/automaton.rs index 7e2be9a151..fcfcf29975 100644 --- a/regex-automata/src/dfa/automaton.rs +++ b/regex-automata/src/dfa/automaton.rs @@ -7,6 +7,7 @@ use crate::{ prefilter::Prefilter, primitives::{PatternID, StateID}, search::{Anchored, HalfMatch, Input, MatchError}, + start, }, }; @@ -226,8 +227,8 @@ pub unsafe trait Automaton { /// ``` fn next_eoi_state(&self, current: StateID) -> StateID; - /// Return the ID of the start state for this lazy DFA when executing a - /// forward search. + /// Return the ID of the start state for this DFA for the given starting + /// configuration. /// /// Unlike typical DFA implementations, the start state for DFAs in this /// crate is dependent on a few different factors: @@ -235,12 +236,41 @@ pub unsafe trait Automaton { /// * The [`Anchored`] mode of the search. Unanchored, anchored and /// anchored searches for a specific [`PatternID`] all use different start /// states. - /// * The position at which the search begins, via [`Input::start`]. This - /// and the byte immediately preceding the start of the search (if one - /// exists) influence which look-behind assertions are true at the start - /// of the search. This in turn influences which start state is selected. - /// * Whether the search is a forward or reverse search. This routine can - /// only be used for forward searches. + /// * Whether a "look-behind" byte exists. For example, the `^` anchor + /// matches if and only if there is no look-behind byte. + /// * The specific value of that look-behind byte. For example, a `(?m:^)` + /// assertion only matches when there is either no look-behind byte, or + /// when the look-behind byte is a line terminator. + /// + /// The [starting configuration](start::Config) provides the above + /// information. + /// + /// This routine can be used for either forward or reverse searches. + /// Although, as a convenience, if you have an [`Input`], then it may + /// be more succinct to use [`Automaton::start_state_forward`] or + /// [`Automaton::start_state_reverse`]. Note, for example, that the + /// convenience routines return a [`MatchError`] on failure where as this + /// routine returns a [`StartError`]. + /// + /// # Errors + /// + /// This may return a [`StartError`] if the search needs to give up when + /// determining the start state (for example, if it sees a "quit" byte). + /// This can also return an error if the given configuration contains an + /// unsupported [`Anchored`] configuration. + fn start_state( + &self, + config: &start::Config, + ) -> Result; + + /// Return the ID of the start state for this DFA when executing a forward + /// search. + /// + /// This is a convenience routine for calling [`Automaton::start_state`] + /// that converts the given [`Input`] to a [start + /// configuration](start::Config). Additionally, if an error occurs, it is + /// converted from a [`StartError`] to a [`MatchError`] using the offset + /// information in the given [`Input`]. /// /// # Errors /// @@ -251,23 +281,30 @@ pub unsafe trait Automaton { fn start_state_forward( &self, input: &Input<'_>, - ) -> Result; + ) -> Result { + let config = start::Config::from_input_forward(input); + self.start_state(&config).map_err(|err| match err { + StartError::Quit { byte } => { + let offset = input + .start() + .checked_sub(1) + .expect("no quit in start without look-behind"); + MatchError::quit(byte, offset) + } + StartError::UnsupportedAnchored { mode } => { + MatchError::unsupported_anchored(mode) + } + }) + } - /// Return the ID of the start state for this lazy DFA when executing a - /// reverse search. + /// Return the ID of the start state for this DFA when executing a reverse + /// search. /// - /// Unlike typical DFA implementations, the start state for DFAs in this - /// crate is dependent on a few different factors: - /// - /// * The [`Anchored`] mode of the search. Unanchored, anchored and - /// anchored searches for a specific [`PatternID`] all use different start - /// states. - /// * The position at which the search begins, via [`Input::start`]. This - /// and the byte immediately preceding the start of the search (if one - /// exists) influence which look-behind assertions are true at the start - /// of the search. This in turn influences which start state is selected. - /// * Whether the search is a forward or reverse search. This routine can - /// only be used for reverse searches. + /// This is a convenience routine for calling [`Automaton::start_state`] + /// that converts the given [`Input`] to a [start + /// configuration](start::Config). Additionally, if an error occurs, it is + /// converted from a [`StartError`] to a [`MatchError`] using the offset + /// information in the given [`Input`]. /// /// # Errors /// @@ -278,7 +315,18 @@ pub unsafe trait Automaton { fn start_state_reverse( &self, input: &Input<'_>, - ) -> Result; + ) -> Result { + let config = start::Config::from_input_reverse(input); + self.start_state(&config).map_err(|err| match err { + StartError::Quit { byte } => { + let offset = input.end(); + MatchError::quit(byte, offset) + } + StartError::UnsupportedAnchored { mode } => { + MatchError::unsupported_anchored(mode) + } + }) + } /// If this DFA has a universal starting state for the given anchor mode /// and the DFA supports universal starting states, then this returns that @@ -1084,7 +1132,7 @@ pub unsafe trait Automaton { /// // implementation defined. /// // /// // N.B. We get '3' by inspecting the state machine using 'regex-cli'. - /// // e.g., try `regex-cli debug dfa dense '[^abc]+a' -BbUC`. + /// // e.g., try `regex-cli debug dense dfa -p '[^abc]+a' -BbUC`. /// let id = StateID::new(3 * dfa.stride()).unwrap(); /// let accelerator = dfa.accelerator(id); /// // The `[^abc]+` sub-expression permits [a, b, c] to be accelerated. @@ -1798,6 +1846,14 @@ unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A { (**self).next_eoi_state(current) } + #[inline] + fn start_state( + &self, + config: &start::Config, + ) -> Result { + (**self).start_state(config) + } + #[inline] fn start_state_forward( &self, @@ -2015,6 +2071,90 @@ impl OverlappingState { } } +/// An error that can occur when computing the start state for a search. +/// +/// Computing a start state can fail for a few reasons, either based on +/// incorrect configuration or even based on whether the look-behind byte +/// triggers a quit state. Typically one does not need to handle this error +/// if you're using [`Automaton::start_state_forward`] (or its reverse +/// counterpart), as that routine automatically converts `StartError` to a +/// [`MatchError`] for you. +/// +/// This error may be returned by the [`Automaton::start_state`] routine. +/// +/// This error implements the `std::error::Error` trait when the `std` feature +/// is enabled. +/// +/// This error is marked as non-exhaustive. New variants may be added in a +/// semver compatible release. +#[non_exhaustive] +#[derive(Clone, Debug)] +pub enum StartError { + /// An error that occurs when a starting configuration's look-behind byte + /// is in this DFA's quit set. + Quit { + /// The quit byte that was found. + byte: u8, + }, + /// An error that occurs when the caller requests an anchored mode that + /// isn't supported by the DFA. + UnsupportedAnchored { + /// The anchored mode given that is unsupported. + mode: Anchored, + }, +} + +impl StartError { + pub(crate) fn quit(byte: u8) -> StartError { + StartError::Quit { byte } + } + + pub(crate) fn unsupported_anchored(mode: Anchored) -> StartError { + StartError::UnsupportedAnchored { mode } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for StartError {} + +impl core::fmt::Display for StartError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match *self { + StartError::Quit { byte } => write!( + f, + "error computing start state because the look-behind byte \ + {:?} triggered a quit state", + crate::util::escape::DebugByte(byte), + ), + StartError::UnsupportedAnchored { mode: Anchored::Yes } => { + write!( + f, + "error computing start state because \ + anchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { mode: Anchored::No } => { + write!( + f, + "error computing start state because \ + unanchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { + mode: Anchored::Pattern(pid), + } => { + write!( + f, + "error computing start state because \ + anchored searches for a specific pattern ({}) \ + are not supported or enabled", + pid.as_usize(), + ) + } + } + } +} + /// Runs the given overlapping `search` function (forwards or backwards) until /// a match is found whose offset does not split a codepoint. /// diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 6da865f977..fd96bc878c 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -30,7 +30,7 @@ use crate::{ use crate::{ dfa::{ accel::Accels, - automaton::{fmt_state_indicator, Automaton}, + automaton::{fmt_state_indicator, Automaton, StartError}, special::Special, start::StartKind, DEAD, @@ -40,8 +40,8 @@ use crate::{ int::{Pointer, Usize}, prefilter::Prefilter, primitives::{PatternID, StateID}, - search::{Anchored, Input, MatchError}, - start::{Start, StartByteMap}, + search::Anchored, + start::{self, Start, StartByteMap}, wire::{self, DeserializeError, Endian, SerializeError}, }, }; @@ -66,8 +66,9 @@ const VERSION: u32 = 2; /// /// The default configuration guarantees that a search will never return /// a "quit" error, although it is possible for a search to fail if -/// [`Config::starts_for_each_pattern`] wasn't enabled (which it is not by -/// default) and an [`Anchored::Pattern`] mode is requested via [`Input`]. +/// [`Config::starts_for_each_pattern`] wasn't enabled (which it is +/// not by default) and an [`Anchored::Pattern`] mode is requested via +/// [`Input`](crate::Input). #[cfg(feature = "dfa-build")] #[derive(Clone, Debug, Default)] pub struct Config { @@ -113,8 +114,7 @@ impl Config { /// make searching slower than it otherwise would be if the transitions /// that leave accelerated states are traversed frequently. /// - /// See [`Automaton::accelerator`](crate::dfa::Automaton::accelerator) for - /// an example. + /// See [`Automaton::accelerator`] for an example. /// /// This is enabled by default. pub fn accelerate(mut self, yes: bool) -> Config { @@ -882,20 +882,20 @@ impl Config { /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039 /// use regex_automata::{dfa::{dense, Automaton}, Input}; /// - /// // 600KB isn't enough! + /// // 700KB isn't enough! /// dense::Builder::new() /// .configure(dense::Config::new() - /// .determinize_size_limit(Some(600_000)) + /// .determinize_size_limit(Some(700_000)) /// ) /// .build(r"\w{20}") /// .unwrap_err(); /// - /// // ... but 700KB probably is! + /// // ... but 800KB probably is! /// // (Note that auxiliary storage sizes aren't necessarily stable between /// // releases.) /// let dfa = dense::Builder::new() /// .configure(dense::Config::new() - /// .determinize_size_limit(Some(700_000)) + /// .determinize_size_limit(Some(800_000)) /// ) /// .build(r"\w{20}")?; /// let haystack = "A".repeat(20).into_bytes(); @@ -1228,13 +1228,14 @@ impl Builder { } else { let mut set = nfa.byte_class_set().clone(); // It is important to distinguish any "quit" bytes from all other - // bytes. Otherwise, a non-quit byte may end up in the same class - // as a quit byte, and thus cause the DFA stop when it shouldn't. + // bytes. Otherwise, a non-quit byte may end up in the same + // class as a quit byte, and thus cause the DFA to stop when it + // shouldn't. // // Test case: // - // regex-cli find hybrid regex -w @conn.json.1000x.log \ - // '^#' '\b10\.55\.182\.100\b' + // regex-cli find match dense --unicode-word-boundary \ + // -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log if !quitset.is_empty() { set.add_set(&quitset); } @@ -2345,6 +2346,24 @@ impl<'a> DFA<&'a [u32]> { dfa.accels.validate()?; // N.B. dfa.special doesn't have a way to do unchecked deserialization, // so it has already been validated. + for state in dfa.states() { + // If the state is an accel state, then it must have a non-empty + // accelerator. + if dfa.is_accel_state(state.id()) { + let index = dfa.accelerator_index(state.id()); + if index >= dfa.accels.len() { + return Err(DeserializeError::generic( + "found DFA state with invalid accelerator index", + )); + } + let needles = dfa.accels.needles(index); + if !(1 <= needles.len() && needles.len() <= 3) { + return Err(DeserializeError::generic( + "accelerator needles has invalid length", + )); + } + } + } Ok((dfa, nread)) } @@ -2885,31 +2904,33 @@ impl OwnedDFA { fn set_universal_starts(&mut self) { assert_eq!(6, Start::len(), "expected 6 start configurations"); - let start_id = |dfa: &mut OwnedDFA, inp: &Input<'_>, start: Start| { + let start_id = |dfa: &mut OwnedDFA, + anchored: Anchored, + start: Start| { // This OK because we only call 'start' under conditions // in which we know it will succeed. - dfa.st.start(inp, start).expect("valid Input configuration") + dfa.st.start(anchored, start).expect("valid Input configuration") }; if self.start_kind().has_unanchored() { - let inp = Input::new("").anchored(Anchored::No); - let sid = start_id(self, &inp, Start::NonWordByte); - if sid == start_id(self, &inp, Start::WordByte) - && sid == start_id(self, &inp, Start::Text) - && sid == start_id(self, &inp, Start::LineLF) - && sid == start_id(self, &inp, Start::LineCR) - && sid == start_id(self, &inp, Start::CustomLineTerminator) + let anchor = Anchored::No; + let sid = start_id(self, anchor, Start::NonWordByte); + if sid == start_id(self, anchor, Start::WordByte) + && sid == start_id(self, anchor, Start::Text) + && sid == start_id(self, anchor, Start::LineLF) + && sid == start_id(self, anchor, Start::LineCR) + && sid == start_id(self, anchor, Start::CustomLineTerminator) { self.st.universal_start_unanchored = Some(sid); } } if self.start_kind().has_anchored() { - let inp = Input::new("").anchored(Anchored::Yes); - let sid = start_id(self, &inp, Start::NonWordByte); - if sid == start_id(self, &inp, Start::WordByte) - && sid == start_id(self, &inp, Start::Text) - && sid == start_id(self, &inp, Start::LineLF) - && sid == start_id(self, &inp, Start::LineCR) - && sid == start_id(self, &inp, Start::CustomLineTerminator) + let anchor = Anchored::Yes; + let sid = start_id(self, anchor, Start::NonWordByte); + if sid == start_id(self, anchor, Start::WordByte) + && sid == start_id(self, anchor, Start::Text) + && sid == start_id(self, anchor, Start::LineLF) + && sid == start_id(self, anchor, Start::LineCR) + && sid == start_id(self, anchor, Start::CustomLineTerminator) { self.st.universal_start_anchored = Some(sid); } @@ -3216,35 +3237,21 @@ unsafe impl> Automaton for DFA { } #[cfg_attr(feature = "perf-inline", inline(always))] - fn start_state_forward( - &self, - input: &Input<'_>, - ) -> Result { - if !self.quitset.is_empty() && input.start() > 0 { - let offset = input.start() - 1; - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); - } - } - let start = self.st.start_map.fwd(&input); - self.st.start(input, start) - } - - #[cfg_attr(feature = "perf-inline", inline(always))] - fn start_state_reverse( + fn start_state( &self, - input: &Input<'_>, - ) -> Result { - if !self.quitset.is_empty() && input.end() < input.haystack().len() { - let offset = input.end(); - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + config: &start::Config, + ) -> Result { + let anchored = config.get_anchored(); + let start = match config.get_look_behind() { + None => Start::Text, + Some(byte) => { + if !self.quitset.is_empty() && self.quitset.contains(byte) { + return Err(StartError::quit(byte)); + } + self.st.start_map.get(byte) } - } - let start = self.st.start_map.rev(&input); - self.st.start(input, start) + }; + self.st.start(anchored, start) } #[cfg_attr(feature = "perf-inline", inline(always))] @@ -4180,28 +4187,27 @@ impl> StartTable { #[cfg_attr(feature = "perf-inline", inline(always))] fn start( &self, - input: &Input<'_>, + anchored: Anchored, start: Start, - ) -> Result { + ) -> Result { let start_index = start.as_usize(); - let mode = input.get_anchored(); - let index = match mode { + let index = match anchored { Anchored::No => { if !self.kind.has_unanchored() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } start_index } Anchored::Yes => { if !self.kind.has_anchored() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } self.stride + start_index } Anchored::Pattern(pid) => { let len = match self.pattern_len { None => { - return Err(MatchError::unsupported_anchored(mode)) + return Err(StartError::unsupported_anchored(anchored)) } Some(len) => len, }; @@ -5086,6 +5092,8 @@ impl core::fmt::Display for BuildError { #[cfg(all(test, feature = "syntax", feature = "dfa-build"))] mod tests { + use crate::{Input, MatchError}; + use super::*; #[test] diff --git a/regex-automata/src/dfa/mod.rs b/regex-automata/src/dfa/mod.rs index 4bb8704352..fd58cac23a 100644 --- a/regex-automata/src/dfa/mod.rs +++ b/regex-automata/src/dfa/mod.rs @@ -320,7 +320,7 @@ dramatically. #[cfg(feature = "dfa-search")] pub use crate::dfa::{ - automaton::{Automaton, OverlappingState}, + automaton::{Automaton, OverlappingState, StartError}, start::StartKind, }; diff --git a/regex-automata/src/dfa/onepass.rs b/regex-automata/src/dfa/onepass.rs index 44691d0c8a..e62bbd383e 100644 --- a/regex-automata/src/dfa/onepass.rs +++ b/regex-automata/src/dfa/onepass.rs @@ -2581,10 +2581,11 @@ impl Cache { /// Represents a single transition in a one-pass DFA. /// -/// The high 24 bits corresponds to the state ID. The low 48 bits corresponds -/// to the transition epsilons, which contains the slots that should be saved -/// when this transition is followed and the conditional epsilon transitions -/// that must be satisfied in order to follow this transition. +/// The high 21 bits corresponds to the state ID. The bit following corresponds +/// to the special "match wins" flag. The remaining low 42 bits corresponds to +/// the transition epsilons, which contains the slots that should be saved when +/// this transition is followed and the conditional epsilon transitions that +/// must be satisfied in order to follow this transition. #[derive(Clone, Copy, Eq, PartialEq)] struct Transition(u64); @@ -2741,7 +2742,7 @@ impl PatternEpsilons { fn set_epsilons(self, epsilons: Epsilons) -> PatternEpsilons { PatternEpsilons( (self.0 & PatternEpsilons::PATTERN_ID_MASK) - | u64::from(epsilons.0), + | (u64::from(epsilons.0) & PatternEpsilons::EPSILONS_MASK), ) } } @@ -2814,12 +2815,15 @@ impl Epsilons { /// Return the set of look-around assertions in these epsilon transitions. fn looks(self) -> LookSet { - LookSet { bits: (self.0 & Epsilons::LOOK_MASK).low_u16() } + LookSet { bits: (self.0 & Epsilons::LOOK_MASK).low_u32() } } /// Set the look-around assertions on these epsilon transitions. fn set_looks(self, look_set: LookSet) -> Epsilons { - Epsilons((self.0 & Epsilons::SLOT_MASK) | u64::from(look_set.bits)) + Epsilons( + (self.0 & Epsilons::SLOT_MASK) + | (u64::from(look_set.bits) & Epsilons::LOOK_MASK), + ) } } diff --git a/regex-automata/src/dfa/regex.rs b/regex-automata/src/dfa/regex.rs index f39c1c055c..5e7e6e38ac 100644 --- a/regex-automata/src/dfa/regex.rs +++ b/regex-automata/src/dfa/regex.rs @@ -853,7 +853,7 @@ impl Builder { } /// Set the dense DFA compilation configuration for this builder using - /// [`dense::Config`](dense::Config). + /// [`dense::Config`]. /// /// This permits setting things like whether the underlying DFAs should /// be minimized. diff --git a/regex-automata/src/dfa/search.rs b/regex-automata/src/dfa/search.rs index 8c012a5944..5a82261f97 100644 --- a/regex-automata/src/dfa/search.rs +++ b/regex-automata/src/dfa/search.rs @@ -176,7 +176,6 @@ fn find_fwd_imp( // It's important that this is a debug_assert, since this can // actually be tripped even if DFA::from_bytes succeeds and // returns a supposedly valid DFA. - debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit(input.haystack()[at], at)); } } @@ -297,7 +296,6 @@ fn find_rev_imp( } else if dfa.is_dead_state(sid) { return Ok(mat); } else { - debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit(input.haystack()[at], at)); } } @@ -422,7 +420,6 @@ fn find_overlapping_fwd_imp( } else if dfa.is_dead_state(sid) { return Ok(()); } else { - debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit( input.haystack()[state.at], state.at, @@ -526,7 +523,6 @@ pub(crate) fn find_overlapping_rev( } else if dfa.is_dead_state(sid) { return Ok(()); } else { - debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit( input.haystack()[state.at], state.at, @@ -600,9 +596,6 @@ fn eoi_fwd( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, input.haystack().len())); } - // N.B. We don't have to check 'is_quit' here because the EOI - // transition can never lead to a quit state. - debug_assert!(!dfa.is_quit_state(*sid)); } } Ok(()) @@ -631,9 +624,6 @@ fn eoi_rev( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, 0)); } - // N.B. We don't have to check 'is_quit' here because the EOI - // transition can never lead to a quit state. - debug_assert!(!dfa.is_quit_state(*sid)); } Ok(()) } diff --git a/regex-automata/src/dfa/sparse.rs b/regex-automata/src/dfa/sparse.rs index 5d8ec23408..38096d9946 100644 --- a/regex-automata/src/dfa/sparse.rs +++ b/regex-automata/src/dfa/sparse.rs @@ -3,13 +3,12 @@ Types and routines specific to sparse DFAs. This module is the home of [`sparse::DFA`](DFA). -Unlike the [`dense`](super::dense) module, this module does not contain a -builder or configuration specific for sparse DFAs. Instead, the intended -way to build a sparse DFA is either by using a default configuration with -its constructor [`sparse::DFA::new`](DFA::new), or by first configuring the -construction of a dense DFA with [`dense::Builder`](super::dense::Builder) -and then calling [`dense::DFA::to_sparse`](super::dense::DFA::to_sparse). For -example, this configures a sparse DFA to do an overlapping search: +Unlike the [`dense`] module, this module does not contain a builder or +configuration specific for sparse DFAs. Instead, the intended way to build a +sparse DFA is either by using a default configuration with its constructor +[`sparse::DFA::new`](DFA::new), or by first configuring the construction of a +dense DFA with [`dense::Builder`] and then calling [`dense::DFA::to_sparse`]. +For example, this configures a sparse DFA to do an overlapping search: ``` use regex_automata::{ @@ -52,7 +51,7 @@ use alloc::{vec, vec::Vec}; use crate::dfa::dense::{self, BuildError}; use crate::{ dfa::{ - automaton::{fmt_state_indicator, Automaton}, + automaton::{fmt_state_indicator, Automaton, StartError}, dense::Flags, special::Special, StartKind, DEAD, @@ -63,8 +62,8 @@ use crate::{ int::{Pointer, Usize, U16, U32}, prefilter::Prefilter, primitives::{PatternID, StateID}, - search::{Anchored, Input, MatchError}, - start::{Start, StartByteMap}, + search::Anchored, + start::{self, Start, StartByteMap}, wire::{self, DeserializeError, Endian, SerializeError}, }, }; @@ -74,18 +73,17 @@ const VERSION: u32 = 2; /// A sparse deterministic finite automaton (DFA) with variable sized states. /// -/// In contrast to a [dense::DFA](crate::dfa::dense::DFA), a sparse DFA uses -/// a more space efficient representation for its transitions. Consequently, -/// sparse DFAs may use much less memory than dense DFAs, but this comes at a -/// price. In particular, reading the more space efficient transitions takes -/// more work, and consequently, searching using a sparse DFA is typically -/// slower than a dense DFA. +/// In contrast to a [dense::DFA], a sparse DFA uses a more space efficient +/// representation for its transitions. Consequently, sparse DFAs may use much +/// less memory than dense DFAs, but this comes at a price. In particular, +/// reading the more space efficient transitions takes more work, and +/// consequently, searching using a sparse DFA is typically slower than a dense +/// DFA. /// /// A sparse DFA can be built using the default configuration via the -/// [`DFA::new`] constructor. Otherwise, one can configure various aspects -/// of a dense DFA via [`dense::Builder`](crate::dfa::dense::Builder), -/// and then convert a dense DFA to a sparse DFA using -/// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse). +/// [`DFA::new`] constructor. Otherwise, one can configure various aspects of a +/// dense DFA via [`dense::Builder`], and then convert a dense DFA to a sparse +/// DFA using [`dense::DFA::to_sparse`]. /// /// In general, a sparse DFA supports all the same search operations as a dense /// DFA. @@ -140,11 +138,9 @@ impl DFA> { /// Parse the given regular expression using a default configuration and /// return the corresponding sparse DFA. /// - /// If you want a non-default configuration, then use - /// the [`dense::Builder`](crate::dfa::dense::Builder) - /// to set your own configuration, and then call - /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create - /// a sparse DFA. + /// If you want a non-default configuration, then use the + /// [`dense::Builder`] to set your own configuration, and then call + /// [`dense::DFA::to_sparse`] to create a sparse DFA. /// /// # Example /// @@ -167,11 +163,9 @@ impl DFA> { /// Parse the given regular expressions using a default configuration and /// return the corresponding multi-DFA. /// - /// If you want a non-default configuration, then use - /// the [`dense::Builder`](crate::dfa::dense::Builder) - /// to set your own configuration, and then call - /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create - /// a sparse DFA. + /// If you want a non-default configuration, then use the + /// [`dense::Builder`] to set your own configuration, and then call + /// [`dense::DFA::to_sparse`] to create a sparse DFA. /// /// # Example /// @@ -511,10 +505,9 @@ impl> DFA { /// * [`DFA::from_bytes`] /// * [`DFA::from_bytes_unchecked`] /// - /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s - /// serialization methods, this does not add any initial padding to the - /// returned bytes. Padding isn't required for sparse DFAs since they have - /// no alignment requirements. + /// Note that unlike a [`dense::DFA`]'s serialization methods, this does + /// not add any initial padding to the returned bytes. Padding isn't + /// required for sparse DFAs since they have no alignment requirements. /// /// # Example /// @@ -553,10 +546,9 @@ impl> DFA { /// * [`DFA::from_bytes`] /// * [`DFA::from_bytes_unchecked`] /// - /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s - /// serialization methods, this does not add any initial padding to the - /// returned bytes. Padding isn't required for sparse DFAs since they have - /// no alignment requirements. + /// Note that unlike a [`dense::DFA`]'s serialization methods, this does + /// not add any initial padding to the returned bytes. Padding isn't + /// required for sparse DFAs since they have no alignment requirements. /// /// # Example /// @@ -595,10 +587,9 @@ impl> DFA { /// * [`DFA::from_bytes`] /// * [`DFA::from_bytes_unchecked`] /// - /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s - /// serialization methods, this does not add any initial padding to the - /// returned bytes. Padding isn't required for sparse DFAs since they have - /// no alignment requirements. + /// Note that unlike a [`dense::DFA`]'s serialization methods, this does + /// not add any initial padding to the returned bytes. Padding isn't + /// required for sparse DFAs since they have no alignment requirements. /// /// Generally speaking, native endian format should only be used when /// you know that the target you're compiling the DFA for matches the @@ -903,9 +894,9 @@ impl<'a> DFA<&'a [u8]> { /// /// If any of the above are not true, then an error will be returned. /// - /// Note that unlike deserializing a - /// [`dense::DFA`](crate::dfa::dense::DFA), deserializing a sparse DFA has - /// no alignment requirements. That is, an alignment of `1` is valid. + /// Note that unlike deserializing a [`dense::DFA`], deserializing a sparse + /// DFA has no alignment requirements. That is, an alignment of `1` is + /// valid. /// /// # Panics /// @@ -1001,8 +992,8 @@ impl<'a> DFA<&'a [u8]> { // (by trying to decode every state) and start state ID list below. If // either validation fails, then we return an error. let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? }; - dfa.tt.validate(&dfa.special)?; - dfa.st.validate(&dfa.special, &dfa.tt)?; + let seen = dfa.tt.validate(&dfa.special)?; + dfa.st.validate(&dfa.special, &seen)?; // N.B. dfa.special doesn't have a way to do unchecked deserialization, // so it has already been validated. Ok((dfa, nread)) @@ -1207,35 +1198,21 @@ unsafe impl> Automaton for DFA { } #[inline] - fn start_state_forward( + fn start_state( &self, - input: &Input<'_>, - ) -> Result { - if !self.quitset.is_empty() && input.start() > 0 { - let offset = input.start() - 1; - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); - } - } - let start = self.st.start_map.fwd(&input); - self.st.start(input, start) - } - - #[inline] - fn start_state_reverse( - &self, - input: &Input<'_>, - ) -> Result { - if !self.quitset.is_empty() && input.end() < input.haystack().len() { - let offset = input.end(); - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + config: &start::Config, + ) -> Result { + let anchored = config.get_anchored(); + let start = match config.get_look_behind() { + None => Start::Text, + Some(byte) => { + if !self.quitset.is_empty() && self.quitset.contains(byte) { + return Err(StartError::quit(byte)); + } + self.st.start_map.get(byte) } - } - let start = self.st.start_map.rev(&input); - self.st.start(input, start) + }; + self.st.start(anchored, start) } #[inline] @@ -1411,63 +1388,8 @@ impl> Transitions { /// /// That is, every state ID can be used to correctly index a state in this /// table. - fn validate(&self, sp: &Special) -> Result<(), DeserializeError> { - // In order to validate everything, we not only need to make sure we - // can decode every state, but that every transition in every state - // points to a valid state. There are many duplicative transitions, so - // we record state IDs that we've verified so that we don't redo the - // decoding work. - // - // Except, when in no_std mode, we don't have dynamic memory allocation - // available to us, so we skip this optimization. It's not clear - // whether doing something more clever is worth it just yet. If you're - // profiling this code and need it to run faster, please file an issue. - // - // OK, so we also use this to record the set of valid state IDs. Since - // it is possible for a transition to point to an invalid state ID that - // still (somehow) deserializes to a valid state. So we need to make - // sure our transitions are limited to actually correct state IDs. - // The problem is, I'm not sure how to do this verification step in - // no-std no-alloc mode. I think we'd *have* to store the set of valid - // state IDs in the DFA itself. For now, we don't do this verification - // in no-std no-alloc mode. The worst thing that can happen is an - // incorrect result. But no panics or memory safety problems should - // result. Because we still do validate that the state itself is - // "valid" in the sense that everything it points to actually exists. - // - // ---AG - struct Seen { - #[cfg(feature = "alloc")] - set: alloc::collections::BTreeSet, - #[cfg(not(feature = "alloc"))] - set: core::marker::PhantomData, - } - - #[cfg(feature = "alloc")] - impl Seen { - fn new() -> Seen { - Seen { set: alloc::collections::BTreeSet::new() } - } - fn insert(&mut self, id: StateID) { - self.set.insert(id); - } - fn contains(&self, id: &StateID) -> bool { - self.set.contains(id) - } - } - - #[cfg(not(feature = "alloc"))] - impl Seen { - fn new() -> Seen { - Seen { set: core::marker::PhantomData } - } - fn insert(&mut self, _id: StateID) {} - fn contains(&self, _id: &StateID) -> bool { - false - } - } - - let mut verified: Seen = Seen::new(); + fn validate(&self, sp: &Special) -> Result { + let mut verified = Seen::new(); // We need to make sure that we decode the correct number of states. // Otherwise, an empty set of transitions would validate even if the // recorded state length is non-empty. @@ -1544,7 +1466,7 @@ impl> Transitions { "mismatching sparse state length", )); } - Ok(()) + Ok(verified) } /// Converts these transitions to a borrowed value. @@ -1682,7 +1604,7 @@ impl> Transitions { let state = &state[nr..]; if npats == 0 { return Err(DeserializeError::generic( - "state marked as a match, but has no pattern IDs", + "state marked as a match, but pattern length is zero", )); } @@ -1704,6 +1626,21 @@ impl> Transitions { } else { (&[][..], state) }; + if is_match && pattern_ids.is_empty() { + return Err(DeserializeError::generic( + "state marked as a match, but has no pattern IDs", + )); + } + if sp.is_match_state(id) && pattern_ids.is_empty() { + return Err(DeserializeError::generic( + "state marked special as a match, but has no pattern IDs", + )); + } + if sp.is_match_state(id) != is_match { + return Err(DeserializeError::generic( + "whether state is a match or not is inconsistent", + )); + } // Now read this state's accelerator info. The first byte is the length // of the accelerator, which is typically 0 (for no acceleration) but @@ -2084,28 +2021,19 @@ impl> StartTable { fn validate( &self, sp: &Special, - trans: &Transitions, + seen: &Seen, ) -> Result<(), DeserializeError> { for (id, _, _) in self.iter() { + if !seen.contains(&id) { + return Err(DeserializeError::generic( + "found invalid start state ID", + )); + } if sp.is_match_state(id) { return Err(DeserializeError::generic( "start states cannot be match states", )); } - // Confirm that the start state points to a valid state. - let state = trans.try_state(sp, id)?; - // And like for the transition table, confirm that the transitions - // on all start states themselves point to a valid state. - // - // It'd probably be better to integrate this validation with the - // transition table, or otherwise store a sorted sequence of all - // valid state IDs in the sparse DFA itself. That way, we could - // check that every pointer to a state corresponds precisely to a - // correct and valid state. - for i in 0..state.ntrans { - let to = state.next_at(i); - let _ = trans.try_state(sp, to)?; - } } Ok(()) } @@ -2145,28 +2073,27 @@ impl> StartTable { /// panics. fn start( &self, - input: &Input<'_>, + anchored: Anchored, start: Start, - ) -> Result { + ) -> Result { let start_index = start.as_usize(); - let mode = input.get_anchored(); - let index = match mode { + let index = match anchored { Anchored::No => { if !self.kind.has_unanchored() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } start_index } Anchored::Yes => { if !self.kind.has_anchored() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } self.stride + start_index } Anchored::Pattern(pid) => { let len = match self.pattern_len { None => { - return Err(MatchError::unsupported_anchored(mode)) + return Err(StartError::unsupported_anchored(anchored)) } Some(len) => len, }; @@ -2561,6 +2488,62 @@ impl<'a> fmt::Debug for StateMut<'a> { } } +// In order to validate everything, we not only need to make sure we +// can decode every state, but that every transition in every state +// points to a valid state. There are many duplicative transitions, so +// we record state IDs that we've verified so that we don't redo the +// decoding work. +// +// Except, when in no_std mode, we don't have dynamic memory allocation +// available to us, so we skip this optimization. It's not clear +// whether doing something more clever is worth it just yet. If you're +// profiling this code and need it to run faster, please file an issue. +// +// OK, so we also use this to record the set of valid state IDs. Since +// it is possible for a transition to point to an invalid state ID that +// still (somehow) deserializes to a valid state. So we need to make +// sure our transitions are limited to actually correct state IDs. +// The problem is, I'm not sure how to do this verification step in +// no-std no-alloc mode. I think we'd *have* to store the set of valid +// state IDs in the DFA itself. For now, we don't do this verification +// in no-std no-alloc mode. The worst thing that can happen is an +// incorrect result. But no panics or memory safety problems should +// result. Because we still do validate that the state itself is +// "valid" in the sense that everything it points to actually exists. +// +// ---AG +#[derive(Debug)] +struct Seen { + #[cfg(feature = "alloc")] + set: alloc::collections::BTreeSet, + #[cfg(not(feature = "alloc"))] + set: core::marker::PhantomData, +} + +#[cfg(feature = "alloc")] +impl Seen { + fn new() -> Seen { + Seen { set: alloc::collections::BTreeSet::new() } + } + fn insert(&mut self, id: StateID) { + self.set.insert(id); + } + fn contains(&self, id: &StateID) -> bool { + self.set.contains(id) + } +} + +#[cfg(not(feature = "alloc"))] +impl Seen { + fn new() -> Seen { + Seen { set: core::marker::PhantomData } + } + fn insert(&mut self, _id: StateID) {} + fn contains(&self, _id: &StateID) -> bool { + false + } +} + /* /// A binary search routine specialized specifically to a sparse DFA state's /// transitions. Specifically, the transitions are defined as a set of pairs diff --git a/regex-automata/src/hybrid/dfa.rs b/regex-automata/src/hybrid/dfa.rs index 67261c1a30..bd9179b194 100644 --- a/regex-automata/src/hybrid/dfa.rs +++ b/regex-automata/src/hybrid/dfa.rs @@ -13,7 +13,7 @@ use alloc::vec::Vec; use crate::{ hybrid::{ - error::{BuildError, CacheError}, + error::{BuildError, CacheError, StartError}, id::{LazyStateID, LazyStateIDError}, search, }, @@ -28,7 +28,7 @@ use crate::{ Anchored, HalfMatch, Input, MatchError, MatchKind, PatternSet, }, sparse_set::SparseSets, - start::{Start, StartByteMap}, + start::{self, Start, StartByteMap}, }, }; @@ -1518,8 +1518,8 @@ impl DFA { Lazy::new(self, cache).cache_next_state(current, unit) } - /// Return the ID of the start state for this lazy DFA when executing a - /// forward search. + /// Return the ID of the start state for this lazy DFA for the given + /// starting configuration. /// /// Unlike typical DFA implementations, the start state for DFAs in this /// crate is dependent on a few different factors: @@ -1527,85 +1527,122 @@ impl DFA { /// * The [`Anchored`] mode of the search. Unanchored, anchored and /// anchored searches for a specific [`PatternID`] all use different start /// states. - /// * The position at which the search begins, via [`Input::start`]. This - /// and the byte immediately preceding the start of the search (if one - /// exists) influence which look-behind assertions are true at the start - /// of the search. This in turn influences which start state is selected. - /// * Whether the search is a forward or reverse search. This routine can - /// only be used for forward searches. + /// * Whether a "look-behind" byte exists. For example, the `^` anchor + /// matches if and only if there is no look-behind byte. + /// * The specific value of that look-behind byte. For example, a `(?m:^)` + /// assertion only matches when there is either no look-behind byte, or + /// when the look-behind byte is a line terminator. + /// + /// The [starting configuration](start::Config) provides the above + /// information. + /// + /// This routine can be used for either forward or reverse searches. + /// Although, as a convenience, if you have an [`Input`], then it + /// may be more succinct to use [`DFA::start_state_forward`] or + /// [`DFA::start_state_reverse`]. Note, for example, that the convenience + /// routines return a [`MatchError`] on failure where as this routine + /// returns a [`StartError`]. /// /// # Errors /// - /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search - /// needs to give up when determining the start state (for example, if - /// it sees a "quit" byte or if the cache has been cleared too many - /// times). This can also return an error if the given `Input` contains an - /// unsupported [`Anchored`] configuration. + /// This may return a [`StartError`] if the search needs to give up when + /// determining the start state (for example, if it sees a "quit" byte + /// or if the cache has become inefficient). This can also return an + /// error if the given configuration contains an unsupported [`Anchored`] + /// configuration. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn start_state( + &self, + cache: &mut Cache, + config: &start::Config, + ) -> Result { + let lazy = LazyRef::new(self, cache); + let anchored = config.get_anchored(); + let start = match config.get_look_behind() { + None => Start::Text, + Some(byte) => { + if !self.quitset.is_empty() && self.quitset.contains(byte) { + return Err(StartError::quit(byte)); + } + self.start_map.get(byte) + } + }; + let start_id = lazy.get_cached_start_id(anchored, start)?; + if !start_id.is_unknown() { + return Ok(start_id); + } + Lazy::new(self, cache).cache_start_group(anchored, start) + } + + /// Return the ID of the start state for this lazy DFA when executing a + /// forward search. + /// + /// This is a convenience routine for calling [`DFA::start_state`] that + /// converts the given [`Input`] to a [start configuration](start::Config). + /// Additionally, if an error occurs, it is converted from a [`StartError`] + /// to a [`MatchError`] using the offset information in the given + /// [`Input`]. + /// + /// # Errors + /// + /// This may return a [`MatchError`] if the search needs to give up when + /// determining the start state (for example, if it sees a "quit" byte or + /// if the cache has become inefficient). This can also return an error if + /// the given `Input` contains an unsupported [`Anchored`] configuration. #[cfg_attr(feature = "perf-inline", inline(always))] pub fn start_state_forward( &self, cache: &mut Cache, input: &Input<'_>, ) -> Result { - if !self.quitset.is_empty() && input.start() > 0 { - let offset = input.start() - 1; - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + let config = start::Config::from_input_forward(input); + self.start_state(cache, &config).map_err(|err| match err { + StartError::Cache { .. } => MatchError::gave_up(input.start()), + StartError::Quit { byte } => { + let offset = input + .start() + .checked_sub(1) + .expect("no quit in start without look-behind"); + MatchError::quit(byte, offset) } - } - let start_type = self.start_map.fwd(input); - let start = LazyRef::new(self, cache) - .get_cached_start_id(input, start_type)?; - if !start.is_unknown() { - return Ok(start); - } - Lazy::new(self, cache).cache_start_group(input, start_type) + StartError::UnsupportedAnchored { mode } => { + MatchError::unsupported_anchored(mode) + } + }) } /// Return the ID of the start state for this lazy DFA when executing a /// reverse search. /// - /// Unlike typical DFA implementations, the start state for DFAs in this - /// crate is dependent on a few different factors: - /// - /// * The [`Anchored`] mode of the search. Unanchored, anchored and - /// anchored searches for a specific [`PatternID`] all use different start - /// states. - /// * The position at which the search begins, via [`Input::start`]. This - /// and the byte immediately preceding the start of the search (if one - /// exists) influence which look-behind assertions are true at the start - /// of the search. This in turn influences which start state is selected. - /// * Whether the search is a forward or reverse search. This routine can - /// only be used for reverse searches. + /// This is a convenience routine for calling [`DFA::start_state`] that + /// converts the given [`Input`] to a [start configuration](start::Config). + /// Additionally, if an error occurs, it is converted from a [`StartError`] + /// to a [`MatchError`] using the offset information in the given + /// [`Input`]. /// /// # Errors /// - /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search - /// needs to give up when determining the start state (for example, if - /// it sees a "quit" byte or if the cache has been cleared too many - /// times). This can also return an error if the given `Input` contains an - /// unsupported [`Anchored`] configuration. + /// This may return a [`MatchError`] if the search needs to give up when + /// determining the start state (for example, if it sees a "quit" byte or + /// if the cache has become inefficient). This can also return an error if + /// the given `Input` contains an unsupported [`Anchored`] configuration. #[cfg_attr(feature = "perf-inline", inline(always))] pub fn start_state_reverse( &self, cache: &mut Cache, input: &Input<'_>, ) -> Result { - if !self.quitset.is_empty() && input.end() < input.haystack().len() { - let offset = input.end(); - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + let config = start::Config::from_input_reverse(input); + self.start_state(cache, &config).map_err(|err| match err { + StartError::Cache { .. } => MatchError::gave_up(input.end()), + StartError::Quit { byte } => { + let offset = input.end(); + MatchError::quit(byte, offset) } - } - let start_type = self.start_map.rev(input); - let start = LazyRef::new(self, cache) - .get_cached_start_id(input, start_type)?; - if !start.is_unknown() { - return Ok(start); - } - Lazy::new(self, cache).cache_start_group(input, start_type) + StartError::UnsupportedAnchored { mode } => { + MatchError::unsupported_anchored(mode) + } + }) } /// Returns the total number of patterns that match in this state. @@ -2066,8 +2103,10 @@ impl<'i, 'c> Lazy<'i, 'c> { /// Here's an example that justifies 'inline(never)' /// /// ```ignore - /// regex-cli find hybrid dfa \ - /// @all-codepoints-utf8-100x '\pL{100}' --cache-capacity 10000000 + /// regex-cli find match hybrid \ + /// --cache-capacity 100000000 \ + /// -p '\pL{100}' + /// all-codepoints-utf8-100x /// ``` /// /// Where 'all-codepoints-utf8-100x' is the UTF-8 encoding of every @@ -2122,16 +2161,15 @@ impl<'i, 'c> Lazy<'i, 'c> { #[inline(never)] fn cache_start_group( &mut self, - input: &Input<'_>, + anchored: Anchored, start: Start, - ) -> Result { - let mode = input.get_anchored(); - let nfa_start_id = match mode { + ) -> Result { + let nfa_start_id = match anchored { Anchored::No => self.dfa.get_nfa().start_unanchored(), Anchored::Yes => self.dfa.get_nfa().start_anchored(), Anchored::Pattern(pid) => { if !self.dfa.get_config().get_starts_for_each_pattern() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } match self.dfa.get_nfa().start_pattern(pid) { None => return Ok(self.as_ref().dead_id()), @@ -2142,8 +2180,8 @@ impl<'i, 'c> Lazy<'i, 'c> { let id = self .cache_start_one(nfa_start_id, start) - .map_err(|_| MatchError::gave_up(input.start()))?; - self.set_start_state(input, start, id); + .map_err(StartError::cache)?; + self.set_start_state(anchored, start, id); Ok(id) } @@ -2574,13 +2612,13 @@ impl<'i, 'c> Lazy<'i, 'c> { /// 'starts_for_each_pattern' is not enabled. fn set_start_state( &mut self, - input: &Input<'_>, + anchored: Anchored, start: Start, id: LazyStateID, ) { assert!(self.as_ref().is_valid(id)); let start_index = start.as_usize(); - let index = match input.get_anchored() { + let index = match anchored { Anchored::No => start_index, Anchored::Yes => Start::len() + start_index, Anchored::Pattern(pid) => { @@ -2642,17 +2680,16 @@ impl<'i, 'c> LazyRef<'i, 'c> { #[cfg_attr(feature = "perf-inline", inline(always))] fn get_cached_start_id( &self, - input: &Input<'_>, + anchored: Anchored, start: Start, - ) -> Result { + ) -> Result { let start_index = start.as_usize(); - let mode = input.get_anchored(); - let index = match mode { + let index = match anchored { Anchored::No => start_index, Anchored::Yes => Start::len() + start_index, Anchored::Pattern(pid) => { if !self.dfa.get_config().get_starts_for_each_pattern() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } if pid.as_usize() >= self.dfa.pattern_len() { return Ok(self.dead_id()); @@ -3178,12 +3215,12 @@ impl Config { /// be quit bytes _only_ when a Unicode word boundary is present in the /// pattern. /// - /// When enabling this option, callers _must_ be prepared to handle - /// a [`MatchError`](crate::MatchError) error during search. - /// When using a [`Regex`](crate::hybrid::regex::Regex), this - /// corresponds to using the `try_` suite of methods. Alternatively, - /// if callers can guarantee that their input is ASCII only, then a - /// [`MatchError::quit`] error will never be returned while searching. + /// When enabling this option, callers _must_ be prepared to + /// handle a [`MatchError`] error during search. When using a + /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the + /// `try_` suite of methods. Alternatively, if callers can guarantee that + /// their input is ASCII only, then a [`MatchError::quit`] error will never + /// be returned while searching. /// /// This is disabled by default. /// @@ -3269,8 +3306,8 @@ impl Config { /// (The advantage being that non-ASCII quit bytes will only be added if a /// Unicode word boundary is in the pattern.) /// - /// When enabling this option, callers _must_ be prepared to handle a - /// [`MatchError`](crate::MatchError) error during search. When using a + /// When enabling this option, callers _must_ be prepared to + /// handle a [`MatchError`] error during search. When using a /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the /// `try_` suite of methods. /// @@ -3795,8 +3832,8 @@ impl Config { // // Test case: // - // regex-cli find hybrid regex -w @conn.json.1000x.log \ - // '^#' '\b10\.55\.182\.100\b' + // regex-cli find match hybrid --unicode-word-boundary \ + // -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log if !quit.is_empty() { set.add_set(&quit); } diff --git a/regex-automata/src/hybrid/error.rs b/regex-automata/src/hybrid/error.rs index 604daf3c38..d134e7ec90 100644 --- a/regex-automata/src/hybrid/error.rs +++ b/regex-automata/src/hybrid/error.rs @@ -1,4 +1,4 @@ -use crate::{hybrid::id::LazyStateIDError, nfa}; +use crate::{hybrid::id::LazyStateIDError, nfa, util::search::Anchored}; /// An error that occurs when initial construction of a lazy DFA fails. /// @@ -95,6 +95,113 @@ impl core::fmt::Display for BuildError { } } +/// An error that can occur when computing the start state for a search. +/// +/// Computing a start state can fail for a few reasons, either +/// based on incorrect configuration or even based on whether +/// the look-behind byte triggers a quit state. Typically +/// one does not need to handle this error if you're using +/// [`DFA::start_state_forward`](crate::hybrid::dfa::DFA::start_state_forward) +/// (or its reverse counterpart), as that routine automatically converts +/// `StartError` to a [`MatchError`](crate::MatchError) for you. +/// +/// This error may be returned by the +/// [`DFA::start_state`](crate::hybrid::dfa::DFA::start_state) routine. +/// +/// This error implements the `std::error::Error` trait when the `std` feature +/// is enabled. +/// +/// This error is marked as non-exhaustive. New variants may be added in a +/// semver compatible release. +#[non_exhaustive] +#[derive(Clone, Debug)] +pub enum StartError { + /// An error that occurs when cache inefficiency has dropped below the + /// configured heuristic thresholds. + Cache { + /// The underlying cache error that occurred. + err: CacheError, + }, + /// An error that occurs when a starting configuration's look-behind byte + /// is in this DFA's quit set. + Quit { + /// The quit byte that was found. + byte: u8, + }, + /// An error that occurs when the caller requests an anchored mode that + /// isn't supported by the DFA. + UnsupportedAnchored { + /// The anchored mode given that is unsupported. + mode: Anchored, + }, +} + +impl StartError { + pub(crate) fn cache(err: CacheError) -> StartError { + StartError::Cache { err } + } + + pub(crate) fn quit(byte: u8) -> StartError { + StartError::Quit { byte } + } + + pub(crate) fn unsupported_anchored(mode: Anchored) -> StartError { + StartError::UnsupportedAnchored { mode } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for StartError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match *self { + StartError::Cache { ref err } => Some(err), + _ => None, + } + } +} + +impl core::fmt::Display for StartError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match *self { + StartError::Cache { .. } => write!( + f, + "error computing start state because of cache inefficiency" + ), + StartError::Quit { byte } => write!( + f, + "error computing start state because the look-behind byte \ + {:?} triggered a quit state", + crate::util::escape::DebugByte(byte), + ), + StartError::UnsupportedAnchored { mode: Anchored::Yes } => { + write!( + f, + "error computing start state because \ + anchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { mode: Anchored::No } => { + write!( + f, + "error computing start state because \ + unanchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { + mode: Anchored::Pattern(pid), + } => { + write!( + f, + "error computing start state because \ + anchored searches for a specific pattern ({}) \ + are not supported or enabled", + pid.as_usize(), + ) + } + } + } +} + /// An error that occurs when cache usage has become inefficient. /// /// One of the weaknesses of a lazy DFA is that it may need to clear its @@ -126,11 +233,7 @@ impl CacheError { } #[cfg(feature = "std")] -impl std::error::Error for CacheError { - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - None - } -} +impl std::error::Error for CacheError {} impl core::fmt::Display for CacheError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { diff --git a/regex-automata/src/hybrid/mod.rs b/regex-automata/src/hybrid/mod.rs index 44e67e1299..2feb839d16 100644 --- a/regex-automata/src/hybrid/mod.rs +++ b/regex-automata/src/hybrid/mod.rs @@ -133,7 +133,7 @@ compiled DFAs. */ pub use self::{ - error::{BuildError, CacheError}, + error::{BuildError, CacheError, StartError}, id::LazyStateID, }; diff --git a/regex-automata/src/hybrid/regex.rs b/regex-automata/src/hybrid/regex.rs index 75667daf91..b3b1fe317d 100644 --- a/regex-automata/src/hybrid/regex.rs +++ b/regex-automata/src/hybrid/regex.rs @@ -878,7 +878,7 @@ impl Builder { } /// Set the lazy DFA compilation configuration for this builder using - /// [`dfa::Config`](dfa::Config). + /// [`dfa::Config`]. /// /// This permits setting things like whether Unicode word boundaries should /// be heuristically supported or settings how the behavior of the cache. diff --git a/regex-automata/src/hybrid/search.rs b/regex-automata/src/hybrid/search.rs index f232836854..1f4a505db4 100644 --- a/regex-automata/src/hybrid/search.rs +++ b/regex-automata/src/hybrid/search.rs @@ -105,14 +105,14 @@ fn find_fwd_imp( // PERF: For justification of omitting bounds checks, it gives us a // ~10% bump in search time. This was used for a benchmark: // - // regex-cli find hybrid dfa @bigfile '(?m)^.+$' -UBb + // regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile // // PERF: For justification for the loop unrolling, we use a few // different tests: // - // regex-cli find hybrid dfa @$bigfile '\w{50}' -UBb - // regex-cli find hybrid dfa @$bigfile '(?m)^.+$' -UBb - // regex-cli find hybrid dfa @$bigfile 'ZQZQZQZQ' -UBb + // regex-cli find half hybrid -p '\w{50}' -UBb bigfile + // regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile + // regex-cli find half hybrid -p 'ZQZQZQZQ' -UBb bigfile // // And there are three different configurations: // @@ -353,7 +353,7 @@ fn find_rev_imp( // anchored and on shorter haystacks. However, this still makes a // difference. Take this command for example: // - // regex-cli find hybrid regex @$bigfile '(?m)^.+$' -UBb + // regex-cli find match hybrid -p '(?m)^.+$' -UBb bigfile // // (Notice that we use 'find hybrid regex', not 'find hybrid dfa' // like in the justification for the forward direction. The 'regex' diff --git a/regex-automata/src/meta/limited.rs b/regex-automata/src/meta/limited.rs index 192a2625e4..5653adc9aa 100644 --- a/regex-automata/src/meta/limited.rs +++ b/regex-automata/src/meta/limited.rs @@ -69,9 +69,6 @@ pub(crate) fn dfa_try_search_half_rev( } else if dfa.is_dead_state(sid) { return Ok(mat); } else if dfa.is_quit_state(sid) { - if mat.is_some() { - return Ok(mat); - } return Err(MatchError::quit(input.haystack()[at], at).into()); } } @@ -155,9 +152,6 @@ pub(crate) fn hybrid_try_search_half_rev( } else if sid.is_dead() { return Ok(mat); } else if sid.is_quit() { - if mat.is_some() { - return Ok(mat); - } return Err(MatchError::quit(input.haystack()[at], at).into()); } } @@ -209,9 +203,6 @@ fn dfa_eoi_rev( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, sp.start)); } else if dfa.is_quit_state(*sid) { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(byte, sp.start - 1)); } } else { @@ -246,9 +237,6 @@ fn hybrid_eoi_rev( let pattern = dfa.match_pattern(cache, *sid, 0); *mat = Some(HalfMatch::new(pattern, sp.start)); } else if sid.is_quit() { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(byte, sp.start - 1)); } } else { diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index ce3bae0fac..a06d2bb48c 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -3640,8 +3640,8 @@ mod tests { // I found this in the course of building out the benchmark suite for // rebar. #[test] - fn regression() { - env_logger::init(); + fn regression_suffix_literal_count() { + let _ = env_logger::try_init(); let re = Regex::new(r"[a-zA-Z]+ing").unwrap(); assert_eq!(1, re.find_iter("tingling").count()); diff --git a/regex-automata/src/meta/stopat.rs b/regex-automata/src/meta/stopat.rs index e8d716689c..c4dcd797a0 100644 --- a/regex-automata/src/meta/stopat.rs +++ b/regex-automata/src/meta/stopat.rs @@ -81,9 +81,6 @@ pub(crate) fn dfa_try_search_half_fwd( } else if dfa.is_dead_state(sid) { return Ok(mat.ok_or(at)); } else if dfa.is_quit_state(sid) { - if mat.is_some() { - return Ok(mat.ok_or(at)); - } return Err(MatchError::quit(input.haystack()[at], at).into()); } else { // Ideally we wouldn't use a DFA that specialized start states @@ -122,9 +119,6 @@ pub(crate) fn hybrid_try_search_half_fwd( } else if sid.is_dead() { return Ok(mat.ok_or(at)); } else if sid.is_quit() { - if mat.is_some() { - return Ok(mat.ok_or(at)); - } return Err(MatchError::quit(input.haystack()[at], at).into()); } else { // We should NEVER get an unknown state ID back from @@ -162,9 +156,6 @@ fn dfa_eoi_fwd( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, sp.end)); } else if dfa.is_quit_state(*sid) { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(b, sp.end)); } } @@ -201,9 +192,6 @@ fn hybrid_eoi_fwd( let pattern = dfa.match_pattern(cache, *sid, 0); *mat = Some(HalfMatch::new(pattern, sp.end)); } else if sid.is_quit() { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(b, sp.end)); } } diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 065e9ef270..2d2172957f 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1466,7 +1466,7 @@ impl Compiler { // compare and contrast performance of the Pike VM when the code below // is active vs the code above. Here's an example to try: // - // regex-cli find match pikevm -b -p '(?m)^\w{20}' -y '@$smallishru' + // regex-cli find match pikevm -b -p '(?m)^\w{20}' non-ascii-file // // With Unicode classes generated below, this search takes about 45s on // my machine. But with the compressed version above, the search takes @@ -1557,6 +1557,14 @@ impl Compiler { hir::Look::WordAsciiNegate => Look::WordAsciiNegate, hir::Look::WordUnicode => Look::WordUnicode, hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate, + hir::Look::WordStartAscii => Look::WordStartAscii, + hir::Look::WordEndAscii => Look::WordEndAscii, + hir::Look::WordStartUnicode => Look::WordStartUnicode, + hir::Look::WordEndUnicode => Look::WordEndUnicode, + hir::Look::WordStartHalfAscii => Look::WordStartHalfAscii, + hir::Look::WordEndHalfAscii => Look::WordEndHalfAscii, + hir::Look::WordStartHalfUnicode => Look::WordStartHalfUnicode, + hir::Look::WordEndHalfUnicode => Look::WordEndHalfUnicode, }; let id = self.add_look(look)?; Ok(ThompsonRef { start: id, end: id }) diff --git a/regex-automata/src/nfa/thompson/map.rs b/regex-automata/src/nfa/thompson/map.rs index c36ce53866..c92d4c0b8e 100644 --- a/regex-automata/src/nfa/thompson/map.rs +++ b/regex-automata/src/nfa/thompson/map.rs @@ -65,7 +65,7 @@ const INIT: u64 = 14695981039346656037; /// Specifically, one could observe the difference with std's hashmap via /// something like the following benchmark: /// -/// hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'" +/// hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'" /// /// But to observe that difference, you'd have to modify the code to use /// std's hashmap. diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 2108fa3381..1f57f8ebd9 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1841,14 +1841,12 @@ impl SparseTransitions { // This is an alternative implementation that uses binary search. In // some ad hoc experiments, like // - // smallishru=OpenSubtitles2018.raw.sample.smallish.ru - // regex-cli find nfa thompson pikevm -b "@$smallishru" '\b\w+\b' + // regex-cli find match pikevm -b -p '\b\w+\b' non-ascii-file // // I could not observe any improvement, and in fact, things seemed to // be a bit slower. I can see an improvement in at least one benchmark: // - // allcpssmall=all-codepoints-utf8-10x - // regex-cli find nfa thompson pikevm @$allcpssmall '\pL{100}' + // regex-cli find match pikevm -b -p '\pL{100}' all-codepoints-utf8 // // Where total search time goes from 3.2s to 2.4s when using binary // search. diff --git a/regex-automata/src/nfa/thompson/range_trie.rs b/regex-automata/src/nfa/thompson/range_trie.rs index 2d43a5b6f7..75c9b796b1 100644 --- a/regex-automata/src/nfa/thompson/range_trie.rs +++ b/regex-automata/src/nfa/thompson/range_trie.rs @@ -594,7 +594,7 @@ impl State { // Benchmarks suggest that binary search is just a bit faster than // straight linear search. Specifically when using the debug tool: // - // hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'" + // hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'" binary_search(&self.transitions, |t| range.start <= t.range.end) } diff --git a/regex-automata/src/util/determinize/mod.rs b/regex-automata/src/util/determinize/mod.rs index 30a82afb81..ba32991d06 100644 --- a/regex-automata/src/util/determinize/mod.rs +++ b/regex-automata/src/util/determinize/mod.rs @@ -145,9 +145,10 @@ pub(crate) fn next( } Some(_) => {} None => { - look_have = look_have.insert(Look::End); - look_have = look_have.insert(Look::EndLF); - look_have = look_have.insert(Look::EndCRLF); + look_have = look_have + .insert(Look::End) + .insert(Look::EndLF) + .insert(Look::EndCRLF); } } if unit.is_byte(lookm.get_line_terminator()) { @@ -160,11 +161,26 @@ pub(crate) fn next( look_have = look_have.insert(Look::StartCRLF); } if state.is_from_word() == unit.is_word_byte() { - look_have = look_have.insert(Look::WordUnicodeNegate); - look_have = look_have.insert(Look::WordAsciiNegate); + look_have = look_have + .insert(Look::WordAsciiNegate) + .insert(Look::WordUnicodeNegate); } else { - look_have = look_have.insert(Look::WordUnicode); - look_have = look_have.insert(Look::WordAscii); + look_have = + look_have.insert(Look::WordAscii).insert(Look::WordUnicode); + } + if !unit.is_word_byte() { + look_have = look_have + .insert(Look::WordEndHalfAscii) + .insert(Look::WordEndHalfUnicode); + } + if state.is_from_word() && !unit.is_word_byte() { + look_have = look_have + .insert(Look::WordEndAscii) + .insert(Look::WordEndUnicode); + } else if !state.is_from_word() && unit.is_word_byte() { + look_have = look_have + .insert(Look::WordStartAscii) + .insert(Look::WordStartUnicode); } // If we have new assertions satisfied that are among the set of // assertions that exist in this state (that is, just because we added @@ -220,6 +236,14 @@ pub(crate) fn next( { builder.set_look_have(|have| have.insert(Look::StartCRLF)); } + // And also for the start-half word boundary assertions. As long as the + // look-behind byte is not a word char, then the assertions are satisfied. + if nfa.look_set_any().contains_word() && !unit.is_word_byte() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } for nfa_id in sparses.set1.iter() { match *nfa.state(nfa_id) { thompson::State::Union { .. } @@ -563,47 +587,95 @@ pub(crate) fn set_lookbehind_from_start( ) { let rev = nfa.is_reverse(); let lineterm = nfa.look_matcher().get_line_terminator(); + let lookset = nfa.look_set_any(); match *start { - Start::NonWordByte => {} + Start::NonWordByte => { + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } + } Start::WordByte => { - builder.set_is_from_word(); + if lookset.contains_word() { + builder.set_is_from_word(); + } } Start::Text => { - builder.set_look_have(|have| { - have.insert(Look::Start) - .insert(Look::StartLF) - .insert(Look::StartCRLF) - }); + if lookset.contains_anchor_haystack() { + builder.set_look_have(|have| have.insert(Look::Start)); + } + if lookset.contains_anchor_line() { + builder.set_look_have(|have| { + have.insert(Look::StartLF).insert(Look::StartCRLF) + }); + } + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } Start::LineLF => { if rev { - builder.set_is_half_crlf(); - builder.set_look_have(|have| have.insert(Look::StartLF)); + if lookset.contains_anchor_crlf() { + builder.set_is_half_crlf(); + } + if lookset.contains_anchor_line() { + builder.set_look_have(|have| have.insert(Look::StartLF)); + } } else { - builder.set_look_have(|have| have.insert(Look::StartCRLF)); + if lookset.contains_anchor_line() { + builder.set_look_have(|have| have.insert(Look::StartCRLF)); + } } - if lineterm == b'\n' { + if lookset.contains_anchor_line() && lineterm == b'\n' { builder.set_look_have(|have| have.insert(Look::StartLF)); } + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } Start::LineCR => { - if rev { - builder.set_look_have(|have| have.insert(Look::StartCRLF)); - } else { - builder.set_is_half_crlf(); + if lookset.contains_anchor_crlf() { + if rev { + builder.set_look_have(|have| have.insert(Look::StartCRLF)); + } else { + builder.set_is_half_crlf(); + } } - if lineterm == b'\r' { + if lookset.contains_anchor_line() && lineterm == b'\r' { builder.set_look_have(|have| have.insert(Look::StartLF)); } + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } Start::CustomLineTerminator => { - builder.set_look_have(|have| have.insert(Look::StartLF)); + if lookset.contains_anchor_line() { + builder.set_look_have(|have| have.insert(Look::StartLF)); + } // This is a bit of a tricky case, but if the line terminator was // set to a word byte, then we also need to behave as if the start // configuration is Start::WordByte. That is, we need to mark our // state as having come from a word byte. - if utf8::is_word_byte(lineterm) { - builder.set_is_from_word(); + if lookset.contains_word() { + if utf8::is_word_byte(lineterm) { + builder.set_is_from_word(); + } else { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } } } diff --git a/regex-automata/src/util/determinize/state.rs b/regex-automata/src/util/determinize/state.rs index e641235874..effa6f44d7 100644 --- a/regex-automata/src/util/determinize/state.rs +++ b/regex-automata/src/util/determinize/state.rs @@ -197,7 +197,7 @@ impl StateBuilderEmpty { } pub(crate) fn into_matches(mut self) -> StateBuilderMatches { - self.0.extend_from_slice(&[0, 0, 0, 0, 0]); + self.0.extend_from_slice(&[0, 0, 0, 0, 0, 0, 0, 0, 0]); StateBuilderMatches(self.0) } @@ -348,16 +348,17 @@ impl StateBuilderNFA { /// generated by a transition over a "word" byte. (Callers may not always set /// this. For example, if the NFA has no word boundary assertion, then needing /// to track whether a state came from a word byte or not is superfluous and -/// wasteful.) +/// wasteful.) Bit 3 is set to 1 if the state was generated by a transition +/// from a `\r` (forward search) or a `\n` (reverse search) when CRLF mode is +/// enabled. /// -/// Byte 1 corresponds to the look-behind assertions that were satisfied by -/// the transition that created this state. This generally only includes the -/// StartLF and Start assertions. (Look-ahead assertions are not tracked as -/// part of states. Instead, these are applied by re-computing the epsilon -/// closure of a state when computing the transition function. See `next` in -/// the parent module.) +/// Bytes 1..5 correspond to the look-behind assertions that were satisfied +/// by the transition that created this state. (Look-ahead assertions are not +/// tracked as part of states. Instead, these are applied by re-computing the +/// epsilon closure of a state when computing the transition function. See +/// `next` in the parent module.) /// -/// Byte 2 corresponds to the set of look-around assertions (including both +/// Bytes 5..9 correspond to the set of look-around assertions (including both /// look-behind and look-ahead) that appear somewhere in this state's set of /// NFA state IDs. This is used to determine whether this state's epsilon /// closure should be re-computed when computing the transition function. @@ -366,7 +367,7 @@ impl StateBuilderNFA { /// function, we should only re-compute the epsilon closure if those new /// assertions are relevant to this particular state. /// -/// Bytes 3..7 correspond to a 32-bit native-endian encoded integer +/// Bytes 9..13 correspond to a 32-bit native-endian encoded integer /// corresponding to the number of patterns encoded in this state. If the state /// is not a match state (byte 0 bit 0 is 0) or if it's only pattern ID is /// PatternID::ZERO, then no integer is encoded at this position. Instead, byte @@ -452,7 +453,7 @@ impl<'a> Repr<'a> { /// state has no conditional epsilon transitions, then there is no need /// to re-compute the epsilon closure. fn look_need(&self) -> LookSet { - LookSet::read_repr(&self.0[3..]) + LookSet::read_repr(&self.0[5..]) } /// Returns the total number of match pattern IDs in this state. @@ -476,7 +477,7 @@ impl<'a> Repr<'a> { if !self.has_pattern_ids() { PatternID::ZERO } else { - let offset = 9 + index * PatternID::SIZE; + let offset = 13 + index * PatternID::SIZE; // This is OK since we only ever serialize valid PatternIDs to // states. wire::read_pattern_id_unchecked(&self.0[offset..]).0 @@ -507,7 +508,7 @@ impl<'a> Repr<'a> { f(PatternID::ZERO); return; } - let mut pids = &self.0[9..self.pattern_offset_end()]; + let mut pids = &self.0[13..self.pattern_offset_end()]; while !pids.is_empty() { let pid = wire::read_u32(pids); pids = &pids[PatternID::SIZE..]; @@ -539,11 +540,11 @@ impl<'a> Repr<'a> { fn pattern_offset_end(&self) -> usize { let encoded = self.encoded_pattern_len(); if encoded == 0 { - return 5; + return 9; } // This arithmetic is OK since we were able to address this many bytes // when writing to the state, thus, it must fit into a usize. - encoded.checked_mul(4).unwrap().checked_add(9).unwrap() + encoded.checked_mul(4).unwrap().checked_add(13).unwrap() } /// Returns the total number of *encoded* pattern IDs in this state. @@ -557,7 +558,7 @@ impl<'a> Repr<'a> { } // This unwrap is OK since the total number of patterns is always // guaranteed to fit into a usize. - usize::try_from(wire::read_u32(&self.0[5..9])).unwrap() + usize::try_from(wire::read_u32(&self.0[9..13])).unwrap() } } @@ -643,7 +644,7 @@ impl<'a> ReprVec<'a> { /// Mutate the set of look-around (both behind and ahead) assertions that /// appear at least once in this state's set of NFA states. fn set_look_need(&mut self, mut set: impl FnMut(LookSet) -> LookSet) { - set(self.look_need()).write_repr(&mut self.0[3..]); + set(self.look_need()).write_repr(&mut self.0[5..]); } /// Add a pattern ID to this state. All match states must have at least @@ -703,14 +704,14 @@ impl<'a> ReprVec<'a> { return; } let patsize = PatternID::SIZE; - let pattern_bytes = self.0.len() - 9; + let pattern_bytes = self.0.len() - 13; // Every pattern ID uses 4 bytes, so number of bytes should be // divisible by 4. assert_eq!(pattern_bytes % patsize, 0); // This unwrap is OK since we are guaranteed that the maximum number // of possible patterns fits into a u32. let count32 = u32::try_from(pattern_bytes / patsize).unwrap(); - wire::NE::write_u32(count32, &mut self.0[5..9]); + wire::NE::write_u32(count32, &mut self.0[9..13]); } /// Add an NFA state ID to this state. The order in which NFA states are diff --git a/regex-automata/src/util/lazy.rs b/regex-automata/src/util/lazy.rs index de27a2a6e6..0d0b4fb2ae 100644 --- a/regex-automata/src/util/lazy.rs +++ b/regex-automata/src/util/lazy.rs @@ -384,11 +384,7 @@ mod lazy { // SAFETY: state is DONE if and only if data has been fully // initialized. At which point, it is safe to drop. unsafe { - // MSRV(1.60): Use assume_init_drop. The below is how - // assume_init_drop is implemented. - core::ptr::drop_in_place( - (*self.data.as_ptr()).as_mut_ptr(), - ) + self.data.get_mut().assume_init_drop(); } } } diff --git a/regex-automata/src/util/look.rs b/regex-automata/src/util/look.rs index aee31b34e0..73e51c0f6e 100644 --- a/regex-automata/src/util/look.rs +++ b/regex-automata/src/util/look.rs @@ -96,6 +96,42 @@ pub enum Look { WordUnicode = 1 << 8, /// Match a Unicode-aware negation of a word boundary. WordUnicodeNegate = 1 << 9, + /// Match the start of an ASCII-only word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartAscii = 1 << 10, + /// Match the end of an ASCII-only word boundary. That is, this matches + /// a position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndAscii = 1 << 11, + /// Match the start of a Unicode word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartUnicode = 1 << 12, + /// Match the end of a Unicode word boundary. That is, this matches a + /// position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndUnicode = 1 << 13, + /// Match the start half of an ASCII-only word boundary. That is, this + /// matches a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfAscii = 1 << 14, + /// Match the end half of an ASCII-only word boundary. That is, this + /// matches a position at either the end of the haystack or where the + /// following character is not a word character. + WordEndHalfAscii = 1 << 15, + /// Match the start half of a Unicode word boundary. That is, this matches + /// a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfUnicode = 1 << 16, + /// Match the end half of a Unicode word boundary. That is, this matches + /// a position at either the end of the haystack or where the following + /// character is not a word character. + WordEndHalfUnicode = 1 << 17, } impl Look { @@ -117,6 +153,14 @@ impl Look { Look::WordAsciiNegate => Look::WordAsciiNegate, Look::WordUnicode => Look::WordUnicode, Look::WordUnicodeNegate => Look::WordUnicodeNegate, + Look::WordStartAscii => Look::WordEndAscii, + Look::WordEndAscii => Look::WordStartAscii, + Look::WordStartUnicode => Look::WordEndUnicode, + Look::WordEndUnicode => Look::WordStartUnicode, + Look::WordStartHalfAscii => Look::WordEndHalfAscii, + Look::WordEndHalfAscii => Look::WordStartHalfAscii, + Look::WordStartHalfUnicode => Look::WordEndHalfUnicode, + Look::WordEndHalfUnicode => Look::WordStartHalfUnicode, } } @@ -125,28 +169,36 @@ impl Look { /// constructor is guaranteed to return the same look-around variant that /// one started with within a semver compatible release of this crate. #[inline] - pub const fn as_repr(self) -> u16 { + pub const fn as_repr(self) -> u32 { // AFAIK, 'as' is the only way to zero-cost convert an int enum to an // actual int. - self as u16 + self as u32 } /// Given the underlying representation of a `Look` value, return the /// corresponding `Look` value if the representation is valid. Otherwise /// `None` is returned. #[inline] - pub const fn from_repr(repr: u16) -> Option { + pub const fn from_repr(repr: u32) -> Option { match repr { - 0b00_0000_0001 => Some(Look::Start), - 0b00_0000_0010 => Some(Look::End), - 0b00_0000_0100 => Some(Look::StartLF), - 0b00_0000_1000 => Some(Look::EndLF), - 0b00_0001_0000 => Some(Look::StartCRLF), - 0b00_0010_0000 => Some(Look::EndCRLF), - 0b00_0100_0000 => Some(Look::WordAscii), - 0b00_1000_0000 => Some(Look::WordAsciiNegate), - 0b01_0000_0000 => Some(Look::WordUnicode), - 0b10_0000_0000 => Some(Look::WordUnicodeNegate), + 0b00_0000_0000_0000_0001 => Some(Look::Start), + 0b00_0000_0000_0000_0010 => Some(Look::End), + 0b00_0000_0000_0000_0100 => Some(Look::StartLF), + 0b00_0000_0000_0000_1000 => Some(Look::EndLF), + 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF), + 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF), + 0b00_0000_0000_0100_0000 => Some(Look::WordAscii), + 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate), + 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode), + 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate), + 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii), + 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii), + 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode), + 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode), + 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii), + 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii), + 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode), + 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode), _ => None, } } @@ -171,6 +223,14 @@ impl Look { Look::WordAsciiNegate => 'B', Look::WordUnicode => '𝛃', Look::WordUnicodeNegate => '𝚩', + Look::WordStartAscii => '<', + Look::WordEndAscii => '>', + Look::WordStartUnicode => '〈', + Look::WordEndUnicode => '〉', + Look::WordStartHalfAscii => '◁', + Look::WordEndHalfAscii => '▷', + Look::WordStartHalfUnicode => '◀', + Look::WordEndHalfUnicode => '▶', } } } @@ -184,14 +244,14 @@ impl Look { pub struct LookSet { /// The underlying representation this set is exposed to make it possible /// to store it somewhere efficiently. The representation is that - /// of a bitset, where each assertion occupies bit `i` where `i = - /// Look::as_repr()`. + /// of a bitset, where each assertion occupies bit `i` where + /// `i = Look::as_repr()`. /// /// Note that users of this internal representation must permit the full /// range of `u16` values to be represented. For example, even if the /// current implementation only makes use of the 10 least significant bits, /// it may use more bits in a future semver compatible release. - pub bits: u16, + pub bits: u32, } impl LookSet { @@ -294,13 +354,22 @@ impl LookSet { pub fn contains_word_unicode(self) -> bool { self.contains(Look::WordUnicode) || self.contains(Look::WordUnicodeNegate) + || self.contains(Look::WordStartUnicode) + || self.contains(Look::WordEndUnicode) + || self.contains(Look::WordStartHalfUnicode) + || self.contains(Look::WordEndHalfUnicode) } /// Returns true if and only if this set contains any ASCII word boundary /// or negated ASCII word boundary assertions. #[inline] pub fn contains_word_ascii(self) -> bool { - self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate) + self.contains(Look::WordAscii) + || self.contains(Look::WordAsciiNegate) + || self.contains(Look::WordStartAscii) + || self.contains(Look::WordEndAscii) + || self.contains(Look::WordStartHalfAscii) + || self.contains(Look::WordEndHalfAscii) } /// Returns an iterator over all of the look-around assertions in this set. @@ -379,29 +448,31 @@ impl LookSet { *self = self.intersect(other); } - /// Return a `LookSet` from the slice given as a native endian 16-bit + /// Return a `LookSet` from the slice given as a native endian 32-bit /// integer. /// /// # Panics /// - /// This panics if `slice.len() < 2`. + /// This panics if `slice.len() < 4`. #[inline] pub fn read_repr(slice: &[u8]) -> LookSet { - let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap()); + let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap()); LookSet { bits } } - /// Write a `LookSet` as a native endian 16-bit integer to the beginning + /// Write a `LookSet` as a native endian 32-bit integer to the beginning /// of the slice given. /// /// # Panics /// - /// This panics if `slice.len() < 2`. + /// This panics if `slice.len() < 4`. #[inline] pub fn write_repr(self, slice: &mut [u8]) { let raw = self.bits.to_ne_bytes(); slice[0] = raw[0]; slice[1] = raw[1]; + slice[2] = raw[2]; + slice[3] = raw[3]; } /// Checks that all assertions in this set can be matched. @@ -456,9 +527,9 @@ impl Iterator for LookSetIter { return None; } // We'll never have more than u8::MAX distinct look-around assertions, - // so 'repr' will always fit into a u16. - let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); - let look = Look::from_repr(1 << repr)?; + // so 'bit' will always fit into a u16. + let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); + let look = Look::from_repr(1 << bit)?; self.set = self.set.remove(look); Some(look) } @@ -566,6 +637,23 @@ impl LookMatcher { } /// Like `matches`, but forcefully inlined. + /// + /// # Panics + /// + /// This panics when testing any Unicode word boundary assertion in this + /// set and when the Unicode word data is not available. Specifically, this + /// only occurs when the `unicode-word-boundary` feature is not enabled. + /// + /// Since it's generally expected that this routine is called inside of + /// a matching engine, callers should check the error condition when + /// building the matching engine. If there is a Unicode word boundary + /// in the matcher and the data isn't available, then the matcher should + /// fail to build. + /// + /// Callers can check the error condition with [`LookSet::available`]. + /// + /// This also may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn matches_inline( &self, @@ -586,6 +674,26 @@ impl LookMatcher { Look::WordUnicodeNegate => { self.is_word_unicode_negate(haystack, at).unwrap() } + Look::WordStartAscii => self.is_word_start_ascii(haystack, at), + Look::WordEndAscii => self.is_word_end_ascii(haystack, at), + Look::WordStartUnicode => { + self.is_word_start_unicode(haystack, at).unwrap() + } + Look::WordEndUnicode => { + self.is_word_end_unicode(haystack, at).unwrap() + } + Look::WordStartHalfAscii => { + self.is_word_start_half_ascii(haystack, at) + } + Look::WordEndHalfAscii => { + self.is_word_end_half_ascii(haystack, at) + } + Look::WordStartHalfUnicode => { + self.is_word_start_half_unicode(haystack, at).unwrap() + } + Look::WordEndHalfUnicode => { + self.is_word_end_half_unicode(haystack, at).unwrap() + } } } @@ -680,6 +788,46 @@ impl LookMatcher { return false; } } + if set.contains(Look::WordStartAscii) { + if !self.is_word_start_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordEndAscii) { + if !self.is_word_end_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordStartUnicode) { + if !self.is_word_start_unicode(haystack, at).unwrap() { + return false; + } + } + if set.contains(Look::WordEndUnicode) { + if !self.is_word_end_unicode(haystack, at).unwrap() { + return false; + } + } + if set.contains(Look::WordStartHalfAscii) { + if !self.is_word_start_half_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordEndHalfAscii) { + if !self.is_word_end_half_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordStartHalfUnicode) { + if !self.is_word_start_half_unicode(haystack, at).unwrap() { + return false; + } + } + if set.contains(Look::WordEndHalfUnicode) { + if !self.is_word_end_half_unicode(haystack, at).unwrap() { + return false; + } + } true } @@ -703,7 +851,15 @@ impl LookMatcher { Look::WordAscii | Look::WordAsciiNegate | Look::WordUnicode - | Look::WordUnicodeNegate => { + | Look::WordUnicodeNegate + | Look::WordStartAscii + | Look::WordEndAscii + | Look::WordStartUnicode + | Look::WordEndUnicode + | Look::WordStartHalfAscii + | Look::WordEndHalfAscii + | Look::WordStartHalfUnicode + | Look::WordEndHalfUnicode => { // We need to mark all ranges of bytes whose pairs result in // evaluating \b differently. This isn't technically correct // for Unicode word boundaries, but DFAs can't handle those @@ -931,6 +1087,177 @@ impl LookMatcher { }; Ok(word_before == word_after) } + + /// Returns true when [`Look::WordStartAscii`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_word_start_ascii(&self, haystack: &[u8], at: usize) -> bool { + let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]); + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + !word_before && word_after + } + + /// Returns true when [`Look::WordEndAscii`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_word_end_ascii(&self, haystack: &[u8], at: usize) -> bool { + let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]); + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + word_before && !word_after + } + + /// Returns true when [`Look::WordStartUnicode`] is satisfied `at` the + /// given position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + /// + /// # Errors + /// + /// This returns an error when Unicode word boundary tables + /// are not available. Specifically, this only occurs when the + /// `unicode-word-boundary` feature is not enabled. + #[inline] + pub fn is_word_start_unicode( + &self, + haystack: &[u8], + at: usize, + ) -> Result { + let word_before = is_word_char::rev(haystack, at)?; + let word_after = is_word_char::fwd(haystack, at)?; + Ok(!word_before && word_after) + } + + /// Returns true when [`Look::WordEndUnicode`] is satisfied `at` the + /// given position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + /// + /// # Errors + /// + /// This returns an error when Unicode word boundary tables + /// are not available. Specifically, this only occurs when the + /// `unicode-word-boundary` feature is not enabled. + #[inline] + pub fn is_word_end_unicode( + &self, + haystack: &[u8], + at: usize, + ) -> Result { + let word_before = is_word_char::rev(haystack, at)?; + let word_after = is_word_char::fwd(haystack, at)?; + Ok(word_before && !word_after) + } + + /// Returns true when [`Look::WordStartHalfAscii`] is satisfied `at` the + /// given position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_word_start_half_ascii( + &self, + haystack: &[u8], + at: usize, + ) -> bool { + let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]); + !word_before + } + + /// Returns true when [`Look::WordEndHalfAscii`] is satisfied `at` the + /// given position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_word_end_half_ascii(&self, haystack: &[u8], at: usize) -> bool { + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + !word_after + } + + /// Returns true when [`Look::WordStartHalfUnicode`] is satisfied `at` the + /// given position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + /// + /// # Errors + /// + /// This returns an error when Unicode word boundary tables + /// are not available. Specifically, this only occurs when the + /// `unicode-word-boundary` feature is not enabled. + #[inline] + pub fn is_word_start_half_unicode( + &self, + haystack: &[u8], + at: usize, + ) -> Result { + // See `is_word_unicode_negate` for why we need to do this. We don't + // need to do it for `is_word_start_unicode` because that guarantees + // that the position matched falls on a valid UTF-8 boundary given + // that the right side must be in \w. + let word_before = at > 0 + && match utf8::decode_last(&haystack[..at]) { + None | Some(Err(_)) => return Ok(false), + Some(Ok(_)) => is_word_char::rev(haystack, at)?, + }; + Ok(!word_before) + } + + /// Returns true when [`Look::WordEndHalfUnicode`] is satisfied `at` the + /// given position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + /// + /// # Errors + /// + /// This returns an error when Unicode word boundary tables + /// are not available. Specifically, this only occurs when the + /// `unicode-word-boundary` feature is not enabled. + #[inline] + pub fn is_word_end_half_unicode( + &self, + haystack: &[u8], + at: usize, + ) -> Result { + // See `is_word_unicode_negate` for why we need to do this. We don't + // need to do it for `is_word_end_unicode` because that guarantees + // that the position matched falls on a valid UTF-8 boundary given + // that the left side must be in \w. + let word_after = at < haystack.len() + && match utf8::decode(&haystack[at..]) { + None | Some(Err(_)) => return Ok(false), + Some(Ok(_)) => is_word_char::fwd(haystack, at)?, + }; + Ok(!word_after) + } } impl Default for LookMatcher { @@ -1024,7 +1351,9 @@ impl core::fmt::Display for UnicodeWordBoundaryError { // There are perhaps other choices as well. Why did I stop at these 4? Because // I wanted to preserve my sanity. I suspect I'll wind up adding the lazy DFA // approach eventually, as the benefits of the DFA approach are somewhat -// compelling. The 'boundary-words-holmes' benchmark tests this: +// compelling. The 'boundary-words-holmes' benchmark tests this. (Note that +// the commands below no longer work. If necessary, we should re-capitulate +// the benchmark from whole cloth in rebar.) // // $ regex-cli bench measure -f boundary-words-holmes -e pikevm > dfa.csv // @@ -1322,8 +1651,7 @@ mod is_word_char { fn is_word_character(c: char) -> bool { use crate::util::{unicode_data::perl_word::PERL_WORD, utf8}; - // MSRV(1.59): Use 'u8::try_from(c)' instead. - if u8::try_from(u32::from(c)).map_or(false, utf8::is_word_byte) { + if u8::try_from(c).map_or(false, utf8::is_word_byte) { return true; } PERL_WORD @@ -1656,50 +1984,478 @@ mod tests { } #[test] - fn look_set() { - let mut f = LookSet::default(); - assert!(!f.contains(Look::Start)); - assert!(!f.contains(Look::End)); - assert!(!f.contains(Look::StartLF)); - assert!(!f.contains(Look::EndLF)); - assert!(!f.contains(Look::WordUnicode)); - assert!(!f.contains(Look::WordUnicodeNegate)); - assert!(!f.contains(Look::WordAscii)); - assert!(!f.contains(Look::WordAsciiNegate)); + fn look_matches_word_start_ascii() { + let look = Look::WordStartAscii; - f = f.insert(Look::Start); - assert!(f.contains(Look::Start)); - f = f.remove(Look::Start); - assert!(!f.contains(Look::Start)); + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) - f = f.insert(Look::End); - assert!(f.contains(Look::End)); - f = f.remove(Look::End); - assert!(!f.contains(Look::End)); + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); - f = f.insert(Look::StartLF); - assert!(f.contains(Look::StartLF)); - f = f.remove(Look::StartLF); - assert!(!f.contains(Look::StartLF)); + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); - f = f.insert(Look::EndLF); - assert!(f.contains(Look::EndLF)); - f = f.remove(Look::EndLF); - assert!(!f.contains(Look::EndLF)); + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); - f = f.insert(Look::StartCRLF); - assert!(f.contains(Look::StartCRLF)); - f = f.remove(Look::StartCRLF); - assert!(!f.contains(Look::StartCRLF)); + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); - f = f.insert(Look::EndCRLF); - assert!(f.contains(Look::EndCRLF)); - f = f.remove(Look::EndCRLF); - assert!(!f.contains(Look::EndCRLF)); + // Non word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); - f = f.insert(Look::WordUnicode); - assert!(f.contains(Look::WordUnicode)); - f = f.remove(Look::WordUnicode); + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_matches_word_end_ascii() { + let look = Look::WordEndAscii; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_start_unicode() { + let look = Look::WordStartUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_end_unicode() { + let look = Look::WordEndUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_matches_word_start_half_ascii() { + let look = Look::WordStartHalfAscii; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(testlook!(look, "𝛃", 1)); + assert!(testlook!(look, "𝛃", 2)); + assert!(testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 1)); + assert!(testlook!(look, "𝛃𐆀", 2)); + assert!(testlook!(look, "𝛃𐆀", 3)); + assert!(testlook!(look, "𝛃𐆀", 5)); + assert!(testlook!(look, "𝛃𐆀", 6)); + assert!(testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_matches_word_end_half_ascii() { + let look = Look::WordEndHalfAscii; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(testlook!(look, "𝛃", 1)); + assert!(testlook!(look, "𝛃", 2)); + assert!(testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 1)); + assert!(testlook!(look, "𝛃𐆀", 2)); + assert!(testlook!(look, "𝛃𐆀", 3)); + assert!(testlook!(look, "𝛃𐆀", 5)); + assert!(testlook!(look, "𝛃𐆀", 6)); + assert!(testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_start_half_unicode() { + let look = Look::WordStartHalfUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_end_half_unicode() { + let look = Look::WordEndHalfUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_set() { + let mut f = LookSet::default(); + assert!(!f.contains(Look::Start)); + assert!(!f.contains(Look::End)); + assert!(!f.contains(Look::StartLF)); + assert!(!f.contains(Look::EndLF)); + assert!(!f.contains(Look::WordUnicode)); + assert!(!f.contains(Look::WordUnicodeNegate)); + assert!(!f.contains(Look::WordAscii)); + assert!(!f.contains(Look::WordAsciiNegate)); + + f = f.insert(Look::Start); + assert!(f.contains(Look::Start)); + f = f.remove(Look::Start); + assert!(!f.contains(Look::Start)); + + f = f.insert(Look::End); + assert!(f.contains(Look::End)); + f = f.remove(Look::End); + assert!(!f.contains(Look::End)); + + f = f.insert(Look::StartLF); + assert!(f.contains(Look::StartLF)); + f = f.remove(Look::StartLF); + assert!(!f.contains(Look::StartLF)); + + f = f.insert(Look::EndLF); + assert!(f.contains(Look::EndLF)); + f = f.remove(Look::EndLF); + assert!(!f.contains(Look::EndLF)); + + f = f.insert(Look::StartCRLF); + assert!(f.contains(Look::StartCRLF)); + f = f.remove(Look::StartCRLF); + assert!(!f.contains(Look::StartCRLF)); + + f = f.insert(Look::EndCRLF); + assert!(f.contains(Look::EndCRLF)); + f = f.remove(Look::EndCRLF); + assert!(!f.contains(Look::EndCRLF)); + + f = f.insert(Look::WordUnicode); + assert!(f.contains(Look::WordUnicode)); + f = f.remove(Look::WordUnicode); assert!(!f.contains(Look::WordUnicode)); f = f.insert(Look::WordUnicodeNegate); @@ -1716,6 +2472,46 @@ mod tests { assert!(f.contains(Look::WordAsciiNegate)); f = f.remove(Look::WordAsciiNegate); assert!(!f.contains(Look::WordAsciiNegate)); + + f = f.insert(Look::WordStartAscii); + assert!(f.contains(Look::WordStartAscii)); + f = f.remove(Look::WordStartAscii); + assert!(!f.contains(Look::WordStartAscii)); + + f = f.insert(Look::WordEndAscii); + assert!(f.contains(Look::WordEndAscii)); + f = f.remove(Look::WordEndAscii); + assert!(!f.contains(Look::WordEndAscii)); + + f = f.insert(Look::WordStartUnicode); + assert!(f.contains(Look::WordStartUnicode)); + f = f.remove(Look::WordStartUnicode); + assert!(!f.contains(Look::WordStartUnicode)); + + f = f.insert(Look::WordEndUnicode); + assert!(f.contains(Look::WordEndUnicode)); + f = f.remove(Look::WordEndUnicode); + assert!(!f.contains(Look::WordEndUnicode)); + + f = f.insert(Look::WordStartHalfAscii); + assert!(f.contains(Look::WordStartHalfAscii)); + f = f.remove(Look::WordStartHalfAscii); + assert!(!f.contains(Look::WordStartHalfAscii)); + + f = f.insert(Look::WordEndHalfAscii); + assert!(f.contains(Look::WordEndHalfAscii)); + f = f.remove(Look::WordEndHalfAscii); + assert!(!f.contains(Look::WordEndHalfAscii)); + + f = f.insert(Look::WordStartHalfUnicode); + assert!(f.contains(Look::WordStartHalfUnicode)); + f = f.remove(Look::WordStartHalfUnicode); + assert!(!f.contains(Look::WordStartHalfUnicode)); + + f = f.insert(Look::WordEndHalfUnicode); + assert!(f.contains(Look::WordEndHalfUnicode)); + f = f.remove(Look::WordEndHalfUnicode); + assert!(!f.contains(Look::WordEndHalfUnicode)); } #[test] @@ -1724,7 +2520,7 @@ mod tests { assert_eq!(0, set.iter().count()); let set = LookSet::full(); - assert_eq!(10, set.iter().count()); + assert_eq!(18, set.iter().count()); let set = LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode); @@ -1735,6 +2531,9 @@ mod tests { let set = LookSet::empty().insert(Look::WordAsciiNegate); assert_eq!(1, set.iter().count()); + + let set = LookSet::empty().insert(Look::WordEndHalfUnicode); + assert_eq!(1, set.iter().count()); } #[test] @@ -1743,6 +2542,6 @@ mod tests { let res = alloc::format!("{:?}", LookSet::empty()); assert_eq!("∅", res); let res = alloc::format!("{:?}", LookSet::full()); - assert_eq!("Az^$rRbB𝛃𝚩", res); + assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res); } } diff --git a/regex-automata/src/util/mod.rs b/regex-automata/src/util/mod.rs index bb739df1df..b3eef64e64 100644 --- a/regex-automata/src/util/mod.rs +++ b/regex-automata/src/util/mod.rs @@ -40,6 +40,7 @@ pub mod look; pub mod pool; pub mod prefilter; pub mod primitives; +pub mod start; #[cfg(feature = "syntax")] pub mod syntax; pub mod wire; @@ -52,6 +53,5 @@ pub(crate) mod memchr; pub(crate) mod search; #[cfg(feature = "alloc")] pub(crate) mod sparse_set; -pub(crate) mod start; pub(crate) mod unicode_data; pub(crate) mod utf8; diff --git a/regex-automata/src/util/pool.rs b/regex-automata/src/util/pool.rs index 95afa4a0d8..d90d4ecffa 100644 --- a/regex-automata/src/util/pool.rs +++ b/regex-automata/src/util/pool.rs @@ -455,11 +455,44 @@ mod inner { /// Create a new pool. The given closure is used to create values in /// the pool when necessary. pub(super) fn new(create: F) -> Pool { - // MSRV(1.63): Mark this function as 'const'. I've arranged the - // code such that it should "just work." Then mark the public - // 'Pool::new' method as 'const' too. (The alloc-only Pool::new - // is already 'const', so that should "just work" too.) The only - // thing we're waiting for is Mutex::new to be const. + // FIXME: Now that we require 1.65+, Mutex::new is available as + // const... So we can almost mark this function as const. But of + // course, we're creating a Vec of stacks below (we didn't when I + // originally wrote this code). It seems like the best way to work + // around this would be to use a `[Stack; MAX_POOL_STACKS]` instead + // of a `Vec`. I refrained from making this change at time + // of writing (2023/10/08) because I was making a lot of other + // changes at the same time and wanted to do this more carefully. + // Namely, because of the cache line optimization, that `[Stack; + // MAX_POOL_STACKS]` would be quite big. It's unclear how bad (if + // at all) that would be. + // + // Another choice would be to lazily allocate the stacks, but... + // I'm not so sure about that. Seems like a fair bit of complexity? + // + // Maybe there's a simple solution I'm missing. + // + // ... OK, I tried to fix this. First, I did it by putting `stacks` + // in an `UnsafeCell` and using a `Once` to lazily initialize it. + // I benchmarked it and everything looked okay. I then made this + // function `const` and thought I was just about done. But the + // public pool type wraps its inner pool in a `Box` to keep its + // size down. Blech. + // + // So then I thought that I could push the box down into this + // type (and leave the non-std version unboxed) and use the same + // `UnsafeCell` technique to lazily initialize it. This has the + // downside of the `Once` now needing to get hit in the owner fast + // path, but maybe that's OK? However, I then realized that we can + // only lazily initialize `stacks`, `owner` and `owner_val`. The + // `create` function needs to be put somewhere outside of the box. + // So now the pool is a `Box`, `Once` and a function. Now we're + // starting to defeat the point of boxing in the first place. So I + // backed out that change too. + // + // Back to square one. I maybe we just don't make a pool's + // constructor const and live with it. It's probably not a huge + // deal. let mut stacks = Vec::with_capacity(MAX_POOL_STACKS); for _ in 0..stacks.capacity() { stacks.push(CacheLine(Mutex::new(vec![]))); diff --git a/regex-automata/src/util/start.rs b/regex-automata/src/util/start.rs index 4e360d083a..27153780ec 100644 --- a/regex-automata/src/util/start.rs +++ b/regex-automata/src/util/start.rs @@ -1,17 +1,195 @@ /*! -Provides some helpers for dealing with start state configurations in DFAs. - -[`Start`] represents the possible starting configurations, while -[`StartByteMap`] represents a way to retrieve the `Start` configuration for a -given position in a haystack. +Provides helpers for dealing with start state configurations in DFAs. */ use crate::util::{ look::LookMatcher, - search::Input, + search::{Anchored, Input}, wire::{self, DeserializeError, SerializeError}, }; +/// The configuration used to determine a DFA's start state for a search. +/// +/// A DFA has a single starting state in the typical textbook description. That +/// is, it corresponds to the set of all starting states for the NFA that built +/// it, along with their espsilon closures. In this crate, however, DFAs have +/// many possible start states due to a few factors: +/// +/// * DFAs support the ability to run either anchored or unanchored searches. +/// Each type of search needs its own start state. For example, an unanchored +/// search requires starting at a state corresponding to a regex with a +/// `(?s-u:.)*?` prefix, which will match through anything. +/// * DFAs also optionally support starting an anchored search for any one +/// specific pattern. Each such pattern requires its own start state. +/// * If a look-behind assertion like `^` or `\b` is used in the regex, then +/// the DFA will need to inspect a single byte immediately before the start of +/// the search to choose the correct start state. +/// +/// Indeed, this configuration precisely encapsulates all of the above factors. +/// The [`Config::anchored`] method sets which kind of anchored search to +/// perform while the [`Config::look_behind`] method provides a way to set +/// the byte that occurs immediately before the start of the search. +/// +/// Generally speaking, this type is only useful when you want to run searches +/// without using an [`Input`]. In particular, an `Input` wants a haystack +/// slice, but callers may not have a contiguous sequence of bytes as a +/// haystack in all cases. This type provides a lower level of control such +/// that callers can provide their own anchored configuration and look-behind +/// byte explicitly. +/// +/// # Example +/// +/// This shows basic usage that permits running a search with a DFA without +/// using the `Input` abstraction. +/// +/// ``` +/// use regex_automata::{ +/// dfa::{Automaton, dense}, +/// util::start, +/// Anchored, +/// }; +/// +/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?; +/// let haystack = "quartz"; +/// +/// let config = start::Config::new().anchored(Anchored::Yes); +/// let mut state = dfa.start_state(&config)?; +/// for &b in haystack.as_bytes().iter() { +/// state = dfa.next_state(state, b); +/// } +/// state = dfa.next_eoi_state(state); +/// assert!(dfa.is_match_state(state)); +/// +/// # Ok::<(), Box>(()) +/// ``` +/// +/// This example shows how to correctly run a search that doesn't begin at +/// the start of a haystack. Notice how we set the look-behind byte, and as +/// a result, the `\b` assertion does not match. +/// +/// ``` +/// use regex_automata::{ +/// dfa::{Automaton, dense}, +/// util::start, +/// Anchored, +/// }; +/// +/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?; +/// let haystack = "quartz"; +/// +/// let config = start::Config::new() +/// .anchored(Anchored::Yes) +/// .look_behind(Some(b'q')); +/// let mut state = dfa.start_state(&config)?; +/// for &b in haystack.as_bytes().iter().skip(1) { +/// state = dfa.next_state(state, b); +/// } +/// state = dfa.next_eoi_state(state); +/// // No match! +/// assert!(!dfa.is_match_state(state)); +/// +/// # Ok::<(), Box>(()) +/// ``` +/// +/// If we had instead not set a look-behind byte, then the DFA would assume +/// that it was starting at the beginning of the haystack, and thus `\b` should +/// match. This in turn would result in erroneously reporting a match: +/// +/// ``` +/// use regex_automata::{ +/// dfa::{Automaton, dense}, +/// util::start, +/// Anchored, +/// }; +/// +/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?; +/// let haystack = "quartz"; +/// +/// // Whoops, forgot the look-behind byte... +/// let config = start::Config::new().anchored(Anchored::Yes); +/// let mut state = dfa.start_state(&config)?; +/// for &b in haystack.as_bytes().iter().skip(1) { +/// state = dfa.next_state(state, b); +/// } +/// state = dfa.next_eoi_state(state); +/// // And now we get a match unexpectedly. +/// assert!(dfa.is_match_state(state)); +/// +/// # Ok::<(), Box>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Config { + look_behind: Option, + anchored: Anchored, +} + +impl Config { + /// Create a new default start configuration. + /// + /// The default is an unanchored search that starts at the beginning of the + /// haystack. + pub fn new() -> Config { + Config { anchored: Anchored::No, look_behind: None } + } + + /// A convenience routine for building a start configuration from an + /// [`Input`] for a forward search. + /// + /// This automatically sets the look-behind byte to the byte immediately + /// preceding the start of the search. If the start of the search is at + /// offset `0`, then no look-behind byte is set. + pub fn from_input_forward(input: &Input<'_>) -> Config { + let look_behind = input + .start() + .checked_sub(1) + .and_then(|i| input.haystack().get(i).copied()); + Config { look_behind, anchored: input.get_anchored() } + } + + /// A convenience routine for building a start configuration from an + /// [`Input`] for a reverse search. + /// + /// This automatically sets the look-behind byte to the byte immediately + /// following the end of the search. If the end of the search is at + /// offset `haystack.len()`, then no look-behind byte is set. + pub fn from_input_reverse(input: &Input<'_>) -> Config { + let look_behind = input.haystack().get(input.end()).copied(); + Config { look_behind, anchored: input.get_anchored() } + } + + /// Set the look-behind byte at the start of a search. + /// + /// Unless the search is intended to logically start at the beginning of a + /// haystack, this should _always_ be set to the byte immediately preceding + /// the start of the search. If no look-behind byte is set, then the start + /// configuration will assume it is at the beginning of the haystack. For + /// example, the anchor `^` will match. + /// + /// The default is that no look-behind byte is set. + pub fn look_behind(mut self, byte: Option) -> Config { + self.look_behind = byte; + self + } + + /// Set the anchored mode of a search. + /// + /// The default is an unanchored search. + pub fn anchored(mut self, mode: Anchored) -> Config { + self.anchored = mode; + self + } + + /// Return the look-behind byte in this configuration, if one exists. + pub fn get_look_behind(&self) -> Option { + self.look_behind + } + + /// Return the anchored mode in this configuration. + pub fn get_anchored(&self) -> Anchored { + self.anchored + } +} + /// A map from every possible byte value to its corresponding starting /// configuration. /// @@ -71,30 +249,11 @@ impl StartByteMap { StartByteMap { map } } - /// Return the forward starting configuration for the given `input`. - #[cfg_attr(feature = "perf-inline", inline(always))] - pub(crate) fn fwd(&self, input: &Input) -> Start { - match input - .start() - .checked_sub(1) - .and_then(|i| input.haystack().get(i)) - { - None => Start::Text, - Some(&byte) => self.get(byte), - } - } - - /// Return the reverse starting configuration for the given `input`. - #[cfg_attr(feature = "perf-inline", inline(always))] - pub(crate) fn rev(&self, input: &Input) -> Start { - match input.haystack().get(input.end()) { - None => Start::Text, - Some(&byte) => self.get(byte), - } - } - + /// Return the starting configuration for the given look-behind byte. + /// + /// If no look-behind exists, callers should use `Start::Text`. #[cfg_attr(feature = "perf-inline", inline(always))] - fn get(&self, byte: u8) -> Start { + pub(crate) fn get(&self, byte: u8) -> Start { self.map[usize::from(byte)] } @@ -253,21 +412,32 @@ mod tests { #[test] fn start_fwd_done_range() { let smap = StartByteMap::new(&LookMatcher::default()); - assert_eq!(Start::Text, smap.fwd(&Input::new("").range(1..0))); + let input = Input::new("").range(1..0); + let config = Config::from_input_forward(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + assert_eq!(Start::Text, start); } #[test] fn start_rev_done_range() { let smap = StartByteMap::new(&LookMatcher::default()); - assert_eq!(Start::Text, smap.rev(&Input::new("").range(1..0))); + let input = Input::new("").range(1..0); + let config = Config::from_input_reverse(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + assert_eq!(Start::Text, start); } #[test] fn start_fwd() { let f = |haystack, start, end| { let smap = StartByteMap::new(&LookMatcher::default()); - let input = &Input::new(haystack).range(start..end); - smap.fwd(input) + let input = Input::new(haystack).range(start..end); + let config = Config::from_input_forward(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + start }; assert_eq!(Start::Text, f("", 0, 0)); @@ -287,8 +457,11 @@ mod tests { fn start_rev() { let f = |haystack, start, end| { let smap = StartByteMap::new(&LookMatcher::default()); - let input = &Input::new(haystack).range(start..end); - smap.rev(input) + let input = Input::new(haystack).range(start..end); + let config = Config::from_input_reverse(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + start }; assert_eq!(Start::Text, f("", 0, 0)); diff --git a/regex-automata/tests/dfa/suite.rs b/regex-automata/tests/dfa/suite.rs index f3445e02a4..8ed6dd0077 100644 --- a/regex-automata/tests/dfa/suite.rs +++ b/regex-automata/tests/dfa/suite.rs @@ -9,7 +9,6 @@ use { util::{prefilter::Prefilter, syntax}, Anchored, Input, PatternSet, }, - regex_syntax::hir, regex_test::{ CompiledRegex, Match, RegexTest, SearchKind, Span, TestResult, TestRunner, @@ -285,10 +284,7 @@ fn compiler( // That is, Unicode word boundaries when searching non-ASCII text. if !test.haystack().is_ascii() { for hir in hirs.iter() { - let looks = hir.properties().look_set(); - if looks.contains(hir::Look::WordUnicode) - || looks.contains(hir::Look::WordUnicodeNegate) - { + if hir.properties().look_set().contains_word_unicode() { return Ok(CompiledRegex::skip()); } } diff --git a/regex-automata/tests/hybrid/api.rs b/regex-automata/tests/hybrid/api.rs index e82d808e34..4b04c4f8fd 100644 --- a/regex-automata/tests/hybrid/api.rs +++ b/regex-automata/tests/hybrid/api.rs @@ -55,7 +55,7 @@ fn too_many_cache_resets_cause_quit() -> Result<(), Box> { let mut cache = dfa.create_cache(); let haystack = "a".repeat(101).into_bytes(); - let err = MatchError::gave_up(25); + let err = MatchError::gave_up(24); // Notice that we make the same amount of progress in each search! That's // because the cache is reused and already has states to handle the first // N bytes. @@ -83,7 +83,7 @@ fn too_many_cache_resets_cause_quit() -> Result<(), Box> { // OK, if we reset the cache, then we should be able to create more states // and make more progress with searching for betas. cache.reset(&dfa); - let err = MatchError::gave_up(27); + let err = MatchError::gave_up(26); assert_eq!( Err(err), dfa.try_search_fwd(&mut cache, &Input::new(&haystack)) diff --git a/regex-automata/tests/lib.rs b/regex-automata/tests/lib.rs index 1465e51eb7..67c979aa8d 100644 --- a/regex-automata/tests/lib.rs +++ b/regex-automata/tests/lib.rs @@ -61,6 +61,7 @@ fn suite() -> anyhow::Result { load!("unicode"); load!("utf8"); load!("word-boundary"); + load!("word-boundary-special"); load!("fowler/basic"); load!("fowler/nullsubexpr"); load!("fowler/repetition"); diff --git a/regex-cli/Cargo.toml b/regex-cli/Cargo.toml index f9dec00242..b5de2b5e78 100644 --- a/regex-cli/Cargo.toml +++ b/regex-cli/Cargo.toml @@ -12,6 +12,7 @@ license = "MIT OR Apache-2.0" categories = ["text-processing"] autotests = false edition = "2021" +rust-version = "1.65" [[bin]] name = "regex-cli" diff --git a/regex-cli/cmd/generate/fowler.rs b/regex-cli/cmd/generate/fowler.rs index c0ab1b361c..c287f6f527 100644 --- a/regex-cli/cmd/generate/fowler.rs +++ b/regex-cli/cmd/generate/fowler.rs @@ -404,7 +404,9 @@ fn count_capturing_groups_ast(ast: ®ex_syntax::ast::Ast) -> usize { | Ast::Literal(_) | Ast::Dot(_) | Ast::Assertion(_) - | Ast::Class(_) => 0, + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) + | Ast::ClassBracketed(_) => 0, Ast::Repetition(ref rep) => count_capturing_groups_ast(&*rep.ast), Ast::Group(ref group) => { let this = if group.is_capturing() { 1 } else { 0 }; diff --git a/regex-lite/Cargo.toml b/regex-lite/Cargo.toml index 1dc144b316..21330fd4ec 100644 --- a/regex-lite/Cargo.toml +++ b/regex-lite/Cargo.toml @@ -10,7 +10,7 @@ A lightweight regex engine that optimizes for binary size and compilation time. """ workspace = ".." edition = "2021" -rust-version = "1.60.0" +rust-version = "1.65" autotests = false # Features are documented in the "Crate features" section of the crate docs: diff --git a/regex-lite/README.md b/regex-lite/README.md index 34c749b216..758fac6aea 100644 --- a/regex-lite/README.md +++ b/regex-lite/README.md @@ -78,7 +78,7 @@ year: 2014, month: 10, day: 14 ### Minimum Rust version policy -This crate's minimum supported `rustc` version is `1.60.0`. +This crate's minimum supported `rustc` version is `1.65.0`. The policy is that the minimum Rust version required to use this crate can be increased in semver compatible updates. diff --git a/regex-lite/src/hir/mod.rs b/regex-lite/src/hir/mod.rs index f73a5420ab..3d61ce8c9e 100644 --- a/regex-lite/src/hir/mod.rs +++ b/regex-lite/src/hir/mod.rs @@ -592,6 +592,24 @@ pub(crate) enum Look { Word = 1 << 6, /// Match an ASCII-only negation of a word boundary. WordNegate = 1 << 7, + /// Match the start of an ASCII-only word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStart = 1 << 8, + /// Match the end of an ASCII-only word boundary. That is, this matches + /// a position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEnd = 1 << 9, + /// Match the start half of an ASCII-only word boundary. That is, this + /// matches a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalf = 1 << 10, + /// Match the end half of an ASCII-only word boundary. That is, this + /// matches a position at either the end of the haystack or where the + /// following character is not a word character. + WordEndHalf = 1 << 11, } impl Look { @@ -631,6 +649,30 @@ impl Look { at < haystack.len() && utf8::is_word_byte(haystack[at]); word_before == word_after } + WordStart => { + let word_before = + at > 0 && utf8::is_word_byte(haystack[at - 1]); + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + !word_before && word_after + } + WordEnd => { + let word_before = + at > 0 && utf8::is_word_byte(haystack[at - 1]); + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + word_before && !word_after + } + WordStartHalf => { + let word_before = + at > 0 && utf8::is_word_byte(haystack[at - 1]); + !word_before + } + WordEndHalf => { + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + !word_after + } } } } diff --git a/regex-lite/src/hir/parse.rs b/regex-lite/src/hir/parse.rs index cc3c21fe63..33bb97a7d8 100644 --- a/regex-lite/src/hir/parse.rs +++ b/regex-lite/src/hir/parse.rs @@ -111,6 +111,12 @@ const ERR_CLASS_DIFFERENCE_UNSUPPORTED: &str = "character class difference is not supported"; const ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED: &str = "character class symmetric difference is not supported"; +const ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED: &str = + "special word boundary assertion is unclosed or has an invalid character"; +const ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED: &str = + "special word boundary assertion is unrecognized"; +const ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF: &str = + "found start of special word boundary or repetition without an end"; /// A regular expression parser. /// @@ -479,12 +485,86 @@ impl<'a> Parser<'a> { 'v' => special('\x0B'), 'A' => Ok(Hir::look(hir::Look::Start)), 'z' => Ok(Hir::look(hir::Look::End)), - 'b' => Ok(Hir::look(hir::Look::Word)), + 'b' => { + let mut hir = Hir::look(hir::Look::Word); + if !self.is_done() && self.char() == '{' { + if let Some(special) = + self.maybe_parse_special_word_boundary()? + { + hir = special; + } + } + Ok(hir) + } 'B' => Ok(Hir::look(hir::Look::WordNegate)), + '<' => Ok(Hir::look(hir::Look::WordStart)), + '>' => Ok(Hir::look(hir::Look::WordEnd)), _ => Err(Error::new(ERR_ESCAPE_UNRECOGNIZED)), } } + /// Attempt to parse a specialty word boundary. That is, `\b{start}`, + /// `\b{end}`, `\b{start-half}` or `\b{end-half}`. + /// + /// This is similar to `maybe_parse_ascii_class` in that, in most cases, + /// if it fails it will just return `None` with no error. This is done + /// because `\b{5}` is a valid expression and we want to let that be parsed + /// by the existing counted repetition parsing code. (I thought about just + /// invoking the counted repetition code from here, but it seemed a little + /// ham-fisted.) + /// + /// Unlike `maybe_parse_ascii_class` though, this can return an error. + /// Namely, if we definitely know it isn't a counted repetition, then we + /// return an error specific to the specialty word boundaries. + /// + /// This assumes the parser is positioned at a `{` immediately following + /// a `\b`. When `None` is returned, the parser is returned to the position + /// at which it started: pointing at a `{`. + /// + /// The position given should correspond to the start of the `\b`. + fn maybe_parse_special_word_boundary(&self) -> Result, Error> { + assert_eq!(self.char(), '{'); + + let is_valid_char = |c| match c { + 'A'..='Z' | 'a'..='z' | '-' => true, + _ => false, + }; + let start = self.pos(); + if !self.bump_and_bump_space() { + return Err(Error::new(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF)); + } + // This is one of the critical bits: if the first non-whitespace + // character isn't in [-A-Za-z] (i.e., this can't be a special word + // boundary), then we bail and let the counted repetition parser deal + // with this. + if !is_valid_char(self.char()) { + self.pos.set(start); + self.char.set(Some('{')); + return Ok(None); + } + + // Now collect up our chars until we see a '}'. + let mut scratch = String::new(); + while !self.is_done() && is_valid_char(self.char()) { + scratch.push(self.char()); + self.bump_and_bump_space(); + } + if self.is_done() || self.char() != '}' { + return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED)); + } + self.bump(); + let kind = match scratch.as_str() { + "start" => hir::Look::WordStart, + "end" => hir::Look::WordEnd, + "start-half" => hir::Look::WordStartHalf, + "end-half" => hir::Look::WordEndHalf, + _ => { + return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED)) + } + }; + Ok(Some(Hir::look(kind))) + } + /// Parse a hex representation of a Unicode codepoint. This handles both /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to @@ -1948,8 +2028,6 @@ bar assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\pL")); assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\p{L}")); assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\i")); - assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\<")); - assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\>")); assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"?")); assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"*")); assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"+")); @@ -1983,6 +2061,11 @@ bar assert_eq!(ERR_CLASS_INTERSECTION_UNSUPPORTED, perr(r"[a&&b]")); assert_eq!(ERR_CLASS_DIFFERENCE_UNSUPPORTED, perr(r"[a--b]")); assert_eq!(ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, perr(r"[a~~b]")); + assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo")); + assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo!}")); + assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED, perr(r"\b{foo}")); + assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"\b{")); + assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"(?x)\b{ ")); } #[test] diff --git a/regex-lite/src/lib.rs b/regex-lite/src/lib.rs index 8008b9e597..68d54824ff 100644 --- a/regex-lite/src/lib.rs +++ b/regex-lite/src/lib.rs @@ -466,12 +466,16 @@ x{n}? exactly n x ### Empty matches
-^     the beginning of a haystack (or start-of-line with multi-line mode)
-$     the end of a haystack (or end-of-line with multi-line mode)
-\A    only the beginning of a haystack (even with multi-line mode enabled)
-\z    only the end of a haystack (even with multi-line mode enabled)
-\b    an ASCII word boundary (\w on one side and \W, \A, or \z on other)
-\B    not an ASCII word boundary
+^               the beginning of a haystack (or start-of-line with multi-line mode)
+$               the end of a haystack (or end-of-line with multi-line mode)
+\A              only the beginning of a haystack (even with multi-line mode enabled)
+\z              only the end of a haystack (even with multi-line mode enabled)
+\b              an ASCII word boundary (\w on one side and \W, \A, or \z on other)
+\B              not an ASCII word boundary
+\b{start}       an ASCII start-of-word boundary (\W|\A on the left, \w on the right)
+\b{end}         an ASCII end-of-word boundary (\w on the left, \W|\z on the right))
+\b{start-half}  half of an ASCII start-of-word boundary (\W|\A on the left)
+\b{end-half}    half of an ASCII end-of-word boundary (\W|\z on the right)
 
The empty regex is valid and matches the empty string. For example, the @@ -581,25 +585,29 @@ Note that this includes all possible escape sequences, even ones that are documented elsewhere.
-\*          literal *, applies to all ASCII except [0-9A-Za-z<>]
-\a          bell (\x07)
-\f          form feed (\x0C)
-\t          horizontal tab
-\n          new line
-\r          carriage return
-\v          vertical tab (\x0B)
-\A          matches at the beginning of a haystack
-\z          matches at the end of a haystack
-\b          word boundary assertion
-\B          negated word boundary assertion
-\x7F        hex character code (exactly two digits)
-\x{10FFFF}  any hex character code corresponding to a Unicode code point
-\u007F      hex character code (exactly four digits)
-\u{7F}      any hex character code corresponding to a Unicode code point
-\U0000007F  hex character code (exactly eight digits)
-\U{7F}      any hex character code corresponding to a Unicode code point
-\d, \s, \w  Perl character class
-\D, \S, \W  negated Perl character class
+\*              literal *, applies to all ASCII except [0-9A-Za-z<>]
+\a              bell (\x07)
+\f              form feed (\x0C)
+\t              horizontal tab
+\n              new line
+\r              carriage return
+\v              vertical tab (\x0B)
+\A              matches at the beginning of a haystack
+\z              matches at the end of a haystack
+\b              word boundary assertion
+\B              negated word boundary assertion
+\b{start}, \<   start-of-word boundary assertion
+\b{end}, \>     end-of-word boundary assertion
+\b{start-half}  half of a start-of-word boundary assertion
+\b{end-half}    half of a end-of-word boundary assertion
+\x7F            hex character code (exactly two digits)
+\x{10FFFF}      any hex character code corresponding to a Unicode code point
+\u007F          hex character code (exactly four digits)
+\u{7F}          any hex character code corresponding to a Unicode code point
+\U0000007F      hex character code (exactly eight digits)
+\U{7F}          any hex character code corresponding to a Unicode code point
+\d, \s, \w      Perl character class
+\D, \S, \W      negated Perl character class
 
### Perl character classes (ASCII only) diff --git a/regex-lite/src/string.rs b/regex-lite/src/string.rs index 1c6eb4ab93..af0a5b6298 100644 --- a/regex-lite/src/string.rs +++ b/regex-lite/src/string.rs @@ -2063,6 +2063,7 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> { /// This example shows how to create and use `CaptureLocations` in a search. /// /// ``` +/// # if !cfg!(target_pointer_width = "64") { return; } // see #1041 /// use regex_lite::Regex; /// /// let re = Regex::new(r"(?\w+)\s+(?\w+)").unwrap(); diff --git a/regex-lite/tests/lib.rs b/regex-lite/tests/lib.rs index 757b394411..89635f2d78 100644 --- a/regex-lite/tests/lib.rs +++ b/regex-lite/tests/lib.rs @@ -38,6 +38,7 @@ fn suite() -> anyhow::Result { load!("unicode"); load!("utf8"); load!("word-boundary"); + load!("word-boundary-special"); load!("fowler/basic"); load!("fowler/nullsubexpr"); load!("fowler/repetition"); diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index aaceeee7f5..e5e541302a 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -8,7 +8,7 @@ documentation = "https://docs.rs/regex-syntax" description = "A regular expression parser." workspace = ".." edition = "2021" -rust-version = "1.60.0" +rust-version = "1.65" # Features are documented in the "Crate features" section of the crate docs: # https://docs.rs/regex-syntax/*/#crate-features diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index 9e4284fee8..6a77ee1343 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -162,6 +162,18 @@ pub enum ErrorKind { /// `(?i)*`. It is, however, possible to create a repetition operating on /// an empty sub-expression. For example, `()*` is still considered valid. RepetitionMissing, + /// The special word boundary syntax, `\b{something}`, was used, but + /// either EOF without `}` was seen, or an invalid character in the + /// braces was seen. + SpecialWordBoundaryUnclosed, + /// The special word boundary syntax, `\b{something}`, was used, but + /// `something` was not recognized as a valid word boundary kind. + SpecialWordBoundaryUnrecognized, + /// The syntax `\b{` was observed, but afterwards the end of the pattern + /// was observed without being able to tell whether it was meant to be a + /// bounded repetition on the `\b` or the beginning of a special word + /// boundary assertion. + SpecialWordOrRepetitionUnexpectedEof, /// The Unicode class is not valid. This typically occurs when a `\p` is /// followed by something other than a `{`. UnicodeClassInvalid, @@ -260,6 +272,29 @@ impl core::fmt::Display for ErrorKind { RepetitionMissing => { write!(f, "repetition operator missing expression") } + SpecialWordBoundaryUnclosed => { + write!( + f, + "special word boundary assertion is either \ + unclosed or contains an invalid character", + ) + } + SpecialWordBoundaryUnrecognized => { + write!( + f, + "unrecognized special word boundary assertion, \ + valid choices are: start, end, start-half \ + or end-half", + ) + } + SpecialWordOrRepetitionUnexpectedEof => { + write!( + f, + "found either the beginning of a special word \ + boundary or a bounded repetition on a \\b with \ + an opening brace, but no closing brace", + ) + } UnicodeClassInvalid => { write!(f, "invalid Unicode character class") } @@ -433,29 +468,94 @@ pub struct Comment { #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum Ast { /// An empty regex that matches everything. - Empty(Span), + Empty(Box), /// A set of flags, e.g., `(?is)`. - Flags(SetFlags), + Flags(Box), /// A single character literal, which includes escape sequences. - Literal(Literal), + Literal(Box), /// The "any character" class. - Dot(Span), + Dot(Box), /// A single zero-width assertion. - Assertion(Assertion), - /// A single character class. This includes all forms of character classes - /// except for `.`. e.g., `\d`, `\pN`, `[a-z]` and `[[:alpha:]]`. - Class(Class), + Assertion(Box), + /// A single Unicode character class, e.g., `\pL` or `\p{Greek}`. + ClassUnicode(Box), + /// A single perl character class, e.g., `\d` or `\W`. + ClassPerl(Box), + /// A single bracketed character class set, which may contain zero or more + /// character ranges and/or zero or more nested classes. e.g., + /// `[a-zA-Z\pL]`. + ClassBracketed(Box), /// A repetition operator applied to an arbitrary regular expression. - Repetition(Repetition), + Repetition(Box), /// A grouped regular expression. - Group(Group), + Group(Box), /// An alternation of regular expressions. - Alternation(Alternation), + Alternation(Box), /// A concatenation of regular expressions. - Concat(Concat), + Concat(Box), } impl Ast { + /// Create an "empty" AST item. + pub fn empty(span: Span) -> Ast { + Ast::Empty(Box::new(span)) + } + + /// Create a "flags" AST item. + pub fn flags(e: SetFlags) -> Ast { + Ast::Flags(Box::new(e)) + } + + /// Create a "literal" AST item. + pub fn literal(e: Literal) -> Ast { + Ast::Literal(Box::new(e)) + } + + /// Create a "dot" AST item. + pub fn dot(span: Span) -> Ast { + Ast::Dot(Box::new(span)) + } + + /// Create a "assertion" AST item. + pub fn assertion(e: Assertion) -> Ast { + Ast::Assertion(Box::new(e)) + } + + /// Create a "Unicode class" AST item. + pub fn class_unicode(e: ClassUnicode) -> Ast { + Ast::ClassUnicode(Box::new(e)) + } + + /// Create a "Perl class" AST item. + pub fn class_perl(e: ClassPerl) -> Ast { + Ast::ClassPerl(Box::new(e)) + } + + /// Create a "bracketed class" AST item. + pub fn class_bracketed(e: ClassBracketed) -> Ast { + Ast::ClassBracketed(Box::new(e)) + } + + /// Create a "repetition" AST item. + pub fn repetition(e: Repetition) -> Ast { + Ast::Repetition(Box::new(e)) + } + + /// Create a "group" AST item. + pub fn group(e: Group) -> Ast { + Ast::Group(Box::new(e)) + } + + /// Create a "alternation" AST item. + pub fn alternation(e: Alternation) -> Ast { + Ast::Alternation(Box::new(e)) + } + + /// Create a "concat" AST item. + pub fn concat(e: Concat) -> Ast { + Ast::Concat(Box::new(e)) + } + /// Return the span of this abstract syntax tree. pub fn span(&self) -> &Span { match *self { @@ -464,7 +564,9 @@ impl Ast { Ast::Literal(ref x) => &x.span, Ast::Dot(ref span) => span, Ast::Assertion(ref x) => &x.span, - Ast::Class(ref x) => x.span(), + Ast::ClassUnicode(ref x) => &x.span, + Ast::ClassPerl(ref x) => &x.span, + Ast::ClassBracketed(ref x) => &x.span, Ast::Repetition(ref x) => &x.span, Ast::Group(ref x) => &x.span, Ast::Alternation(ref x) => &x.span, @@ -488,8 +590,10 @@ impl Ast { | Ast::Flags(_) | Ast::Literal(_) | Ast::Dot(_) - | Ast::Assertion(_) => false, - Ast::Class(_) + | Ast::Assertion(_) + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) => false, + Ast::ClassBracketed(_) | Ast::Repetition(_) | Ast::Group(_) | Ast::Alternation(_) @@ -526,14 +630,14 @@ pub struct Alternation { impl Alternation { /// Return this alternation as an AST. /// - /// If this alternation contains zero ASTs, then Ast::Empty is - /// returned. If this alternation contains exactly 1 AST, then the - /// corresponding AST is returned. Otherwise, Ast::Alternation is returned. + /// If this alternation contains zero ASTs, then `Ast::empty` is returned. + /// If this alternation contains exactly 1 AST, then the corresponding AST + /// is returned. Otherwise, `Ast::alternation` is returned. pub fn into_ast(mut self) -> Ast { match self.asts.len() { - 0 => Ast::Empty(self.span), + 0 => Ast::empty(self.span), 1 => self.asts.pop().unwrap(), - _ => Ast::Alternation(self), + _ => Ast::alternation(self), } } } @@ -551,14 +655,14 @@ pub struct Concat { impl Concat { /// Return this concatenation as an AST. /// - /// If this concatenation contains zero ASTs, then Ast::Empty is - /// returned. If this concatenation contains exactly 1 AST, then the - /// corresponding AST is returned. Otherwise, Ast::Concat is returned. + /// If this alternation contains zero ASTs, then `Ast::empty` is returned. + /// If this alternation contains exactly 1 AST, then the corresponding AST + /// is returned. Otherwise, `Ast::concat` is returned. pub fn into_ast(mut self) -> Ast { match self.asts.len() { - 0 => Ast::Empty(self.span), + 0 => Ast::empty(self.span), 1 => self.asts.pop().unwrap(), - _ => Ast::Concat(self), + _ => Ast::concat(self), } } } @@ -675,31 +779,6 @@ impl HexLiteralKind { } } -/// A single character class expression. -#[derive(Clone, Debug, Eq, PartialEq)] -#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] -pub enum Class { - /// A Unicode character class, e.g., `\pL` or `\p{Greek}`. - Unicode(ClassUnicode), - /// A perl character class, e.g., `\d` or `\W`. - Perl(ClassPerl), - /// A bracketed character class set, which may contain zero or more - /// character ranges and/or zero or more nested classes. e.g., - /// `[a-zA-Z\pL]`. - Bracketed(ClassBracketed), -} - -impl Class { - /// Return the span of this character class. - pub fn span(&self) -> &Span { - match *self { - Class::Perl(ref x) => &x.span, - Class::Unicode(ref x) => &x.span, - Class::Bracketed(ref x) => &x.span, - } - } -} - /// A Perl character class. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] @@ -1249,6 +1328,18 @@ pub enum AssertionKind { WordBoundary, /// `\B` NotWordBoundary, + /// `\b{start}` + WordBoundaryStart, + /// `\b{end}` + WordBoundaryEnd, + /// `\<` (alias for `\b{start}`) + WordBoundaryStartAngle, + /// `\>` (alias for `\b{end}`) + WordBoundaryEndAngle, + /// `\b{start-half}` + WordBoundaryStartHalf, + /// `\b{end-half}` + WordBoundaryEndHalf, } /// A repetition operation applied to a regular expression. @@ -1550,8 +1641,10 @@ impl Drop for Ast { | Ast::Literal(_) | Ast::Dot(_) | Ast::Assertion(_) - // Classes are recursive, so they get their own Drop impl. - | Ast::Class(_) => return, + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) + // Bracketed classes are recursive, they get their own Drop impl. + | Ast::ClassBracketed(_) => return, Ast::Repetition(ref x) if !x.ast.has_subexprs() => return, Ast::Group(ref x) if !x.ast.has_subexprs() => return, Ast::Alternation(ref x) if x.asts.is_empty() => return, @@ -1560,7 +1653,7 @@ impl Drop for Ast { } let empty_span = || Span::splat(Position::new(0, 0, 0)); - let empty_ast = || Ast::Empty(empty_span()); + let empty_ast = || Ast::empty(empty_span()); let mut stack = vec![mem::replace(self, empty_ast())]; while let Some(mut ast) = stack.pop() { match ast { @@ -1569,8 +1662,11 @@ impl Drop for Ast { | Ast::Literal(_) | Ast::Dot(_) | Ast::Assertion(_) - // Classes are recursive, so they get their own Drop impl. - | Ast::Class(_) => {} + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) + // Bracketed classes are recursive, so they get their own Drop + // impl. + | Ast::ClassBracketed(_) => {} Ast::Repetition(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } @@ -1663,9 +1759,9 @@ mod tests { let run = || { let span = || Span::splat(Position::new(0, 0, 0)); - let mut ast = Ast::Empty(span()); + let mut ast = Ast::empty(span()); for i in 0..200 { - ast = Ast::Group(Group { + ast = Ast::group(Group { span: span(), kind: GroupKind::CaptureIndex(i), ast: Box::new(ast), @@ -1694,4 +1790,20 @@ mod tests { .join() .unwrap(); } + + // This tests that our `Ast` has a reasonable size. This isn't a hard rule + // and it can be increased if given a good enough reason. But this test + // exists because the size of `Ast` was at one point over 200 bytes on a + // 64-bit target. Wow. + #[test] + fn ast_size() { + let max = 2 * core::mem::size_of::(); + let size = core::mem::size_of::(); + assert!( + size <= max, + "Ast size of {} bytes is bigger than suggested max {}", + size, + max + ); + } } diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 47ea2586b4..593b14fbc3 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -53,11 +53,11 @@ impl Primitive { /// Convert this primitive into a proper AST. fn into_ast(self) -> Ast { match self { - Primitive::Literal(lit) => Ast::Literal(lit), - Primitive::Assertion(assert) => Ast::Assertion(assert), - Primitive::Dot(span) => Ast::Dot(span), - Primitive::Perl(cls) => Ast::Class(ast::Class::Perl(cls)), - Primitive::Unicode(cls) => Ast::Class(ast::Class::Unicode(cls)), + Primitive::Literal(lit) => Ast::literal(lit), + Primitive::Assertion(assert) => Ast::assertion(assert), + Primitive::Dot(span) => Ast::dot(span), + Primitive::Perl(cls) => Ast::class_perl(cls), + Primitive::Unicode(cls) => Ast::class_unicode(cls), } } @@ -691,7 +691,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.parser().ignore_whitespace.set(v); } - concat.asts.push(Ast::Flags(set)); + concat.asts.push(Ast::flags(set)); Ok(concat) } Either::Right(group) => { @@ -764,7 +764,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { group.ast = Box::new(group_concat.into_ast()); } } - prior_concat.asts.push(Ast::Group(group)); + prior_concat.asts.push(Ast::group(group)); Ok(prior_concat) } @@ -783,7 +783,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { Some(GroupState::Alternation(mut alt)) => { alt.span.end = self.pos(); alt.asts.push(concat.into_ast()); - Ok(Ast::Alternation(alt)) + Ok(Ast::alternation(alt)) } Some(GroupState::Group { group, .. }) => { return Err( @@ -850,7 +850,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { fn pop_class( &self, nested_union: ast::ClassSetUnion, - ) -> Result> { + ) -> Result> { assert_eq!(self.char(), ']'); let item = ast::ClassSet::Item(nested_union.into_item()); @@ -882,7 +882,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { set.span.end = self.pos(); set.kind = prevset; if stack.is_empty() { - Ok(Either::Right(ast::Class::Bracketed(set))) + Ok(Either::Right(set)) } else { union.push(ast::ClassSetItem::Bracketed(Box::new(set))); Ok(Either::Left(union)) @@ -976,7 +976,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { '|' => concat = self.push_alternate(concat)?, '[' => { let class = self.parse_set_class()?; - concat.asts.push(Ast::Class(class)); + concat.asts.push(Ast::class_bracketed(class)); } '?' => { concat = self.parse_uncounted_repetition( @@ -1057,7 +1057,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { greedy = false; self.bump(); } - concat.asts.push(Ast::Repetition(ast::Repetition { + concat.asts.push(Ast::repetition(ast::Repetition { span: ast.span().with_end(self.pos()), op: ast::RepetitionOp { span: Span::new(op_start, self.pos()), @@ -1159,7 +1159,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.error(op_span, ast::ErrorKind::RepetitionCountInvalid) ); } - concat.asts.push(Ast::Repetition(ast::Repetition { + concat.asts.push(Ast::repetition(ast::Repetition { span: ast.span().with_end(self.pos()), op: ast::RepetitionOp { span: op_span, @@ -1212,7 +1212,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { Ok(Either::Right(ast::Group { span: open_span, kind: ast::GroupKind::CaptureName { starts_with_p, name }, - ast: Box::new(Ast::Empty(self.span())), + ast: Box::new(Ast::empty(self.span())), })) } else if self.bump_if("?") { if self.is_eof() { @@ -1241,7 +1241,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { Ok(Either::Right(ast::Group { span: open_span, kind: ast::GroupKind::NonCapturing(flags), - ast: Box::new(Ast::Empty(self.span())), + ast: Box::new(Ast::empty(self.span())), })) } } else { @@ -1249,7 +1249,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { Ok(Either::Right(ast::Group { span: open_span, kind: ast::GroupKind::CaptureIndex(capture_index), - ast: Box::new(Ast::Empty(self.span())), + ast: Box::new(Ast::empty(self.span())), })) } } @@ -1528,18 +1528,115 @@ impl<'s, P: Borrow> ParserI<'s, P> { span, kind: ast::AssertionKind::EndText, })), - 'b' => Ok(Primitive::Assertion(ast::Assertion { - span, - kind: ast::AssertionKind::WordBoundary, - })), + 'b' => { + let mut wb = ast::Assertion { + span, + kind: ast::AssertionKind::WordBoundary, + }; + // After a \b, we "try" to parse things like \b{start} for + // special word boundary assertions. + if !self.is_eof() && self.char() == '{' { + if let Some(kind) = + self.maybe_parse_special_word_boundary(start)? + { + wb.kind = kind; + wb.span.end = self.pos(); + } + } + Ok(Primitive::Assertion(wb)) + } 'B' => Ok(Primitive::Assertion(ast::Assertion { span, kind: ast::AssertionKind::NotWordBoundary, })), + '<' => Ok(Primitive::Assertion(ast::Assertion { + span, + kind: ast::AssertionKind::WordBoundaryStartAngle, + })), + '>' => Ok(Primitive::Assertion(ast::Assertion { + span, + kind: ast::AssertionKind::WordBoundaryEndAngle, + })), _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)), } } + /// Attempt to parse a specialty word boundary. That is, `\b{start}`, + /// `\b{end}`, `\b{start-half}` or `\b{end-half}`. + /// + /// This is similar to `maybe_parse_ascii_class` in that, in most cases, + /// if it fails it will just return `None` with no error. This is done + /// because `\b{5}` is a valid expression and we want to let that be parsed + /// by the existing counted repetition parsing code. (I thought about just + /// invoking the counted repetition code from here, but it seemed a little + /// ham-fisted.) + /// + /// Unlike `maybe_parse_ascii_class` though, this can return an error. + /// Namely, if we definitely know it isn't a counted repetition, then we + /// return an error specific to the specialty word boundaries. + /// + /// This assumes the parser is positioned at a `{` immediately following + /// a `\b`. When `None` is returned, the parser is returned to the position + /// at which it started: pointing at a `{`. + /// + /// The position given should correspond to the start of the `\b`. + fn maybe_parse_special_word_boundary( + &self, + wb_start: Position, + ) -> Result> { + assert_eq!(self.char(), '{'); + + let is_valid_char = |c| match c { + 'A'..='Z' | 'a'..='z' | '-' => true, + _ => false, + }; + let start = self.pos(); + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(wb_start, self.pos()), + ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, + )); + } + let start_contents = self.pos(); + // This is one of the critical bits: if the first non-whitespace + // character isn't in [-A-Za-z] (i.e., this can't be a special word + // boundary), then we bail and let the counted repetition parser deal + // with this. + if !is_valid_char(self.char()) { + self.parser().pos.set(start); + return Ok(None); + } + + // Now collect up our chars until we see a '}'. + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + while !self.is_eof() && is_valid_char(self.char()) { + scratch.push(self.char()); + self.bump_and_bump_space(); + } + if self.is_eof() || self.char() != '}' { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::SpecialWordBoundaryUnclosed, + )); + } + let end = self.pos(); + self.bump(); + let kind = match scratch.as_str() { + "start" => ast::AssertionKind::WordBoundaryStart, + "end" => ast::AssertionKind::WordBoundaryEnd, + "start-half" => ast::AssertionKind::WordBoundaryStartHalf, + "end-half" => ast::AssertionKind::WordBoundaryEndHalf, + _ => { + return Err(self.error( + Span::new(start_contents, end), + ast::ErrorKind::SpecialWordBoundaryUnrecognized, + )) + } + }; + Ok(Some(kind)) + } + /// Parse an octal representation of a Unicode codepoint up to 3 digits /// long. This expects the parser to be positioned at the first octal /// digit and advances the parser to the first character immediately @@ -1743,7 +1840,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// is successful, then the parser is advanced to the position immediately /// following the closing `]`. #[inline(never)] - fn parse_set_class(&self) -> Result { + fn parse_set_class(&self) -> Result { assert_eq!(self.char(), '['); let mut union = @@ -1967,9 +2064,9 @@ impl<'s, P: Borrow> ParserI<'s, P> { // because parsing cannot fail with any interesting error. For example, // in order to use an ASCII character class, it must be enclosed in // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think - // of it as "ASCII character characters have the syntax `[:NAME:]` - // which can only appear within character brackets." This means that - // things like `[[:lower:]A]` are legal constructs. + // of it as "ASCII character classes have the syntax `[:NAME:]` which + // can only appear within character brackets." This means that things + // like `[[:lower:]A]` are legal constructs. // // However, if one types an incorrect ASCII character class, e.g., // `[[:loower:]]`, then we treat that as a normal nested character @@ -2189,12 +2286,12 @@ impl<'p, 's, P: Borrow> ast::Visitor for NestLimiter<'p, 's, P> { | Ast::Literal(_) | Ast::Dot(_) | Ast::Assertion(_) - | Ast::Class(ast::Class::Unicode(_)) - | Ast::Class(ast::Class::Perl(_)) => { + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) => { // These are all base cases, so we don't increment depth. return Ok(()); } - Ast::Class(ast::Class::Bracketed(ref x)) => &x.span, + Ast::ClassBracketed(ref x) => &x.span, Ast::Repetition(ref x) => &x.span, Ast::Group(ref x) => &x.span, Ast::Alternation(ref x) => &x.span, @@ -2210,12 +2307,12 @@ impl<'p, 's, P: Borrow> ast::Visitor for NestLimiter<'p, 's, P> { | Ast::Literal(_) | Ast::Dot(_) | Ast::Assertion(_) - | Ast::Class(ast::Class::Unicode(_)) - | Ast::Class(ast::Class::Perl(_)) => { + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) => { // These are all base cases, so we don't decrement depth. Ok(()) } - Ast::Class(ast::Class::Bracketed(_)) + Ast::ClassBracketed(_) | Ast::Repetition(_) | Ast::Group(_) | Ast::Alternation(_) @@ -2426,12 +2523,12 @@ mod tests { /// Create a meta literal starting at the given position. fn meta_lit(c: char, span: Span) -> Ast { - Ast::Literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c }) + Ast::literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c }) } /// Create a verbatim literal with the given span. fn lit_with(c: char, span: Span) -> Ast { - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span, kind: ast::LiteralKind::Verbatim, c, @@ -2445,17 +2542,17 @@ mod tests { /// Create a concatenation with the given span. fn concat_with(span: Span, asts: Vec) -> Ast { - Ast::Concat(ast::Concat { span, asts }) + Ast::concat(ast::Concat { span, asts }) } /// Create an alternation with the given span. fn alt(range: Range, asts: Vec) -> Ast { - Ast::Alternation(ast::Alternation { span: span(range), asts }) + Ast::alternation(ast::Alternation { span: span(range), asts }) } /// Create a capturing group with the given span. fn group(range: Range, index: u32, ast: Ast) -> Ast { - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span(range), kind: ast::GroupKind::CaptureIndex(index), ast: Box::new(ast), @@ -2488,7 +2585,7 @@ mod tests { }, ); } - Ast::Flags(ast::SetFlags { + Ast::flags(ast::SetFlags { span: span_range(pat, range.clone()), flags: ast::Flags { span: span_range(pat, (range.start + 2)..(range.end - 1)), @@ -2502,7 +2599,7 @@ mod tests { // A nest limit of 0 still allows some types of regexes. assert_eq!( parser_nest_limit("", 0).parse(), - Ok(Ast::Empty(span(0..0))) + Ok(Ast::empty(span(0..0))) ); assert_eq!(parser_nest_limit("a", 0).parse(), Ok(lit('a', 0))); @@ -2516,7 +2613,7 @@ mod tests { ); assert_eq!( parser_nest_limit("a+", 1).parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2542,14 +2639,14 @@ mod tests { ); assert_eq!( parser_nest_limit("a+*", 2).parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..3), op: ast::RepetitionOp { span: span(2..3), kind: ast::RepetitionKind::ZeroOrMore, }, greedy: true, - ast: Box::new(Ast::Repetition(ast::Repetition { + ast: Box::new(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2606,7 +2703,7 @@ mod tests { ); assert_eq!( parser_nest_limit("[a]", 1).parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: ast::ClassSet::Item(ast::ClassSetItem::Literal( @@ -2616,7 +2713,7 @@ mod tests { c: 'a', } )), - }))) + })) ); assert_eq!( parser_nest_limit("[ab]", 1).parse().unwrap_err(), @@ -2776,7 +2873,7 @@ bar vec![ lit_with('a', span_range(pat, 0..1)), lit_with(' ', span_range(pat, 1..2)), - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span_range(pat, 2..9), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span_range(pat, 4..5), @@ -2803,7 +2900,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span_range(pat, 4..pat.len()), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -2825,7 +2922,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span_range(pat, 4..pat.len()), kind: ast::GroupKind::CaptureIndex(1), ast: Box::new(lit_with('a', span_range(pat, 7..8))), @@ -2840,7 +2937,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span_range(pat, 4..pat.len()), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span_range(pat, 8..8), @@ -2858,7 +2955,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(4..13), kind: ast::LiteralKind::HexBrace( ast::HexLiteralKind::X @@ -2877,7 +2974,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span_range(pat, 4..6), kind: ast::LiteralKind::Superfluous, c: ' ', @@ -2895,9 +2992,9 @@ bar Ok(concat_with( span_range(pat, 0..3), vec![ - Ast::Dot(span_range(pat, 0..1)), + Ast::dot(span_range(pat, 0..1)), lit_with('\n', span_range(pat, 1..2)), - Ast::Dot(span_range(pat, 2..3)), + Ast::dot(span_range(pat, 2..3)), ] )) ); @@ -2933,7 +3030,7 @@ bar fn parse_uncounted_repetition() { assert_eq!( parser(r"a*").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2945,7 +3042,7 @@ bar ); assert_eq!( parser(r"a+").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2958,7 +3055,7 @@ bar assert_eq!( parser(r"a?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2970,7 +3067,7 @@ bar ); assert_eq!( parser(r"a??").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..3), op: ast::RepetitionOp { span: span(1..3), @@ -2982,7 +3079,7 @@ bar ); assert_eq!( parser(r"a?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2997,7 +3094,7 @@ bar Ok(concat( 0..3, vec![ - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -3015,7 +3112,7 @@ bar Ok(concat( 0..4, vec![ - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(0..3), op: ast::RepetitionOp { span: span(1..3), @@ -3034,7 +3131,7 @@ bar 0..3, vec![ lit('a', 0), - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(1..3), op: ast::RepetitionOp { span: span(2..3), @@ -3048,7 +3145,7 @@ bar ); assert_eq!( parser(r"(ab)?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..5), op: ast::RepetitionOp { span: span(4..5), @@ -3067,8 +3164,8 @@ bar Ok(alt( 0..3, vec![ - Ast::Empty(span(0..0)), - Ast::Repetition(ast::Repetition { + Ast::empty(span(0..0)), + Ast::repetition(ast::Repetition { span: span(1..3), op: ast::RepetitionOp { span: span(2..3), @@ -3157,7 +3254,7 @@ bar fn parse_counted_repetition() { assert_eq!( parser(r"a{5}").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..4), op: ast::RepetitionOp { span: span(1..4), @@ -3171,7 +3268,7 @@ bar ); assert_eq!( parser(r"a{5,}").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..5), op: ast::RepetitionOp { span: span(1..5), @@ -3185,7 +3282,7 @@ bar ); assert_eq!( parser(r"a{5,9}").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..6), op: ast::RepetitionOp { span: span(1..6), @@ -3199,7 +3296,7 @@ bar ); assert_eq!( parser(r"a{5}?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..5), op: ast::RepetitionOp { span: span(1..5), @@ -3217,7 +3314,7 @@ bar 0..5, vec![ lit('a', 0), - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(1..5), op: ast::RepetitionOp { span: span(2..5), @@ -3237,7 +3334,7 @@ bar 0..6, vec![ lit('a', 0), - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(1..5), op: ast::RepetitionOp { span: span(2..5), @@ -3255,7 +3352,7 @@ bar assert_eq!( parser(r"a{ 5 }").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..6), op: ast::RepetitionOp { span: span(1..6), @@ -3269,7 +3366,7 @@ bar ); assert_eq!( parser(r"a{ 5 , 9 }").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..10), op: ast::RepetitionOp { span: span(1..10), @@ -3283,7 +3380,7 @@ bar ); assert_eq!( parser_ignore_whitespace(r"a{5,9} ?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..8), op: ast::RepetitionOp { span: span(1..8), @@ -3295,6 +3392,23 @@ bar ast: Box::new(lit('a', 0)), })) ); + assert_eq!( + parser(r"\b{5,9}").parse(), + Ok(Ast::repetition(ast::Repetition { + span: span(0..7), + op: ast::RepetitionOp { + span: span(2..7), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Bounded(5, 9) + ), + }, + greedy: true, + ast: Box::new(Ast::assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundary, + })), + })) + ); assert_eq!( parser(r"(?i){0}").parse().unwrap_err(), @@ -3414,7 +3528,7 @@ bar fn parse_alternate() { assert_eq!( parser(r"a|b").parse(), - Ok(Ast::Alternation(ast::Alternation { + Ok(Ast::alternation(ast::Alternation { span: span(0..3), asts: vec![lit('a', 0), lit('b', 2)], })) @@ -3424,7 +3538,7 @@ bar Ok(group( 0..5, 1, - Ast::Alternation(ast::Alternation { + Ast::alternation(ast::Alternation { span: span(1..4), asts: vec![lit('a', 1), lit('b', 3)], }) @@ -3433,14 +3547,14 @@ bar assert_eq!( parser(r"a|b|c").parse(), - Ok(Ast::Alternation(ast::Alternation { + Ok(Ast::alternation(ast::Alternation { span: span(0..5), asts: vec![lit('a', 0), lit('b', 2), lit('c', 4)], })) ); assert_eq!( parser(r"ax|by|cz").parse(), - Ok(Ast::Alternation(ast::Alternation { + Ok(Ast::alternation(ast::Alternation { span: span(0..8), asts: vec![ concat(0..2, vec![lit('a', 0), lit('x', 1)]), @@ -3454,7 +3568,7 @@ bar Ok(group( 0..10, 1, - Ast::Alternation(ast::Alternation { + Ast::alternation(ast::Alternation { span: span(1..9), asts: vec![ concat(1..3, vec![lit('a', 1), lit('x', 2)]), @@ -3503,7 +3617,7 @@ bar parser(r"|").parse(), Ok(alt( 0..1, - vec![Ast::Empty(span(0..0)), Ast::Empty(span(1..1)),] + vec![Ast::empty(span(0..0)), Ast::empty(span(1..1)),] )) ); assert_eq!( @@ -3511,19 +3625,19 @@ bar Ok(alt( 0..2, vec![ - Ast::Empty(span(0..0)), - Ast::Empty(span(1..1)), - Ast::Empty(span(2..2)), + Ast::empty(span(0..0)), + Ast::empty(span(1..1)), + Ast::empty(span(2..2)), ] )) ); assert_eq!( parser(r"a|").parse(), - Ok(alt(0..2, vec![lit('a', 0), Ast::Empty(span(2..2)),])) + Ok(alt(0..2, vec![lit('a', 0), Ast::empty(span(2..2)),])) ); assert_eq!( parser(r"|a").parse(), - Ok(alt(0..2, vec![Ast::Empty(span(0..0)), lit('a', 1),])) + Ok(alt(0..2, vec![Ast::empty(span(0..0)), lit('a', 1),])) ); assert_eq!( @@ -3533,7 +3647,7 @@ bar 1, alt( 1..2, - vec![Ast::Empty(span(1..1)), Ast::Empty(span(2..2)),] + vec![Ast::empty(span(1..1)), Ast::empty(span(2..2)),] ) )) ); @@ -3542,7 +3656,7 @@ bar Ok(group( 0..4, 1, - alt(1..3, vec![lit('a', 1), Ast::Empty(span(3..3)),]) + alt(1..3, vec![lit('a', 1), Ast::empty(span(3..3)),]) )) ); assert_eq!( @@ -3550,7 +3664,7 @@ bar Ok(group( 0..4, 1, - alt(1..3, vec![Ast::Empty(span(1..1)), lit('a', 2),]) + alt(1..3, vec![Ast::empty(span(1..1)), lit('a', 2),]) )) ); @@ -3606,7 +3720,7 @@ bar fn parse_group() { assert_eq!( parser("(?i)").parse(), - Ok(Ast::Flags(ast::SetFlags { + Ok(Ast::flags(ast::SetFlags { span: span(0..4), flags: ast::Flags { span: span(2..3), @@ -3621,7 +3735,7 @@ bar ); assert_eq!( parser("(?iU)").parse(), - Ok(Ast::Flags(ast::SetFlags { + Ok(Ast::flags(ast::SetFlags { span: span(0..5), flags: ast::Flags { span: span(2..4), @@ -3644,7 +3758,7 @@ bar ); assert_eq!( parser("(?i-U)").parse(), - Ok(Ast::Flags(ast::SetFlags { + Ok(Ast::flags(ast::SetFlags { span: span(0..6), flags: ast::Flags { span: span(2..5), @@ -3672,15 +3786,15 @@ bar assert_eq!( parser("()").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..2), kind: ast::GroupKind::CaptureIndex(1), - ast: Box::new(Ast::Empty(span(1..1))), + ast: Box::new(Ast::empty(span(1..1))), })) ); assert_eq!( parser("(a)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..3), kind: ast::GroupKind::CaptureIndex(1), ast: Box::new(lit('a', 1)), @@ -3688,20 +3802,20 @@ bar ); assert_eq!( parser("(())").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..4), kind: ast::GroupKind::CaptureIndex(1), - ast: Box::new(Ast::Group(ast::Group { + ast: Box::new(Ast::group(ast::Group { span: span(1..3), kind: ast::GroupKind::CaptureIndex(2), - ast: Box::new(Ast::Empty(span(2..2))), + ast: Box::new(Ast::empty(span(2..2))), })), })) ); assert_eq!( parser("(?:a)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..5), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span(2..2), @@ -3713,7 +3827,7 @@ bar assert_eq!( parser("(?i:a)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..6), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span(2..3), @@ -3729,7 +3843,7 @@ bar ); assert_eq!( parser("(?i-U:a)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..8), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span(2..5), @@ -3818,7 +3932,7 @@ bar fn parse_capture_name() { assert_eq!( parser("(?z)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..7), kind: ast::GroupKind::CaptureName { starts_with_p: false, @@ -3833,7 +3947,7 @@ bar ); assert_eq!( parser("(?Pz)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..8), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3848,7 +3962,7 @@ bar ); assert_eq!( parser("(?Pz)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..10), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3864,7 +3978,7 @@ bar assert_eq!( parser("(?Pz)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..10), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3880,7 +3994,7 @@ bar assert_eq!( parser("(?Pz)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..10), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3896,7 +4010,7 @@ bar assert_eq!( parser("(?Pz)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..11), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3912,7 +4026,7 @@ bar assert_eq!( parser("(?P)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: Span::new( Position::new(0, 1, 1), Position::new(9, 1, 9), @@ -3928,7 +4042,7 @@ bar index: 1, } }, - ast: Box::new(Ast::Empty(Span::new( + ast: Box::new(Ast::empty(Span::new( Position::new(8, 1, 8), Position::new(8, 1, 8), ))), @@ -3936,7 +4050,7 @@ bar ); assert_eq!( parser("(?P<名字>)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: Span::new( Position::new(0, 1, 1), Position::new(12, 1, 9), @@ -3952,7 +4066,7 @@ bar index: 1, } }, - ast: Box::new(Ast::Empty(Span::new( + ast: Box::new(Ast::empty(Span::new( Position::new(11, 1, 8), Position::new(11, 1, 8), ))), @@ -4381,6 +4495,48 @@ bar kind: ast::AssertionKind::WordBoundary, })) ); + assert_eq!( + parser(r"\b{start}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..9), + kind: ast::AssertionKind::WordBoundaryStart, + })) + ); + assert_eq!( + parser(r"\b{end}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..7), + kind: ast::AssertionKind::WordBoundaryEnd, + })) + ); + assert_eq!( + parser(r"\b{start-half}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..14), + kind: ast::AssertionKind::WordBoundaryStartHalf, + })) + ); + assert_eq!( + parser(r"\b{end-half}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..12), + kind: ast::AssertionKind::WordBoundaryEndHalf, + })) + ); + assert_eq!( + parser(r"\<").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundaryStartAngle, + })) + ); + assert_eq!( + parser(r"\>").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundaryEndAngle, + })) + ); assert_eq!( parser(r"\B").parse_primitive(), Ok(Primitive::Assertion(ast::Assertion { @@ -4418,20 +4574,60 @@ bar kind: ast::ErrorKind::EscapeUnrecognized, } ); - // But also, < and > are banned, so that we may evolve them into - // start/end word boundary assertions. (Not sure if we will...) + + // Starting a special word boundary without any non-whitespace chars + // after the brace makes it ambiguous whether the user meant to write + // a counted repetition (probably not?) or an actual special word + // boundary assertion. assert_eq!( - parser(r"\<").parse_escape().unwrap_err(), + parser(r"\b{").parse_escape().unwrap_err(), TestError { - span: span(0..2), - kind: ast::ErrorKind::EscapeUnrecognized, + span: span(0..3), + kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, } ); assert_eq!( - parser(r"\>").parse_escape().unwrap_err(), + parser_ignore_whitespace(r"\b{ ").parse_escape().unwrap_err(), TestError { - span: span(0..2), - kind: ast::ErrorKind::EscapeUnrecognized, + span: span(0..4), + kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, + } + ); + // When 'x' is not enabled, the space is seen as a non-[-A-Za-z] char, + // and thus causes the parser to treat it as a counted repetition. + assert_eq!( + parser(r"\b{ ").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::RepetitionCountDecimalEmpty, + } + ); + // In this case, we got some valid chars that makes it look like the + // user is writing one of the special word boundary assertions, but + // we forget to close the brace. + assert_eq!( + parser(r"\b{foo").parse_escape().unwrap_err(), + TestError { + span: span(2..6), + kind: ast::ErrorKind::SpecialWordBoundaryUnclosed, + } + ); + // We get the same error as above, except it is provoked by seeing a + // char that we know is invalid before seeing a closing brace. + assert_eq!( + parser(r"\b{foo!}").parse_escape().unwrap_err(), + TestError { + span: span(2..6), + kind: ast::ErrorKind::SpecialWordBoundaryUnclosed, + } + ); + // And this one occurs when, syntactically, everything looks okay, but + // we don't use a valid spelling of a word boundary assertion. + assert_eq!( + parser(r"\b{foo}").parse_escape().unwrap_err(), + TestError { + span: span(3..6), + kind: ast::ErrorKind::SpecialWordBoundaryUnrecognized, } ); @@ -4494,15 +4690,15 @@ bar ); assert_eq!( parser_octal(r"\778").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..4), asts: vec![ - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(0..3), kind: ast::LiteralKind::Octal, c: '?', }), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(3..4), kind: ast::LiteralKind::Verbatim, c: '8', @@ -4512,15 +4708,15 @@ bar ); assert_eq!( parser_octal(r"\7777").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..5), asts: vec![ - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(0..4), kind: ast::LiteralKind::Octal, c: '\u{01FF}', }), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(4..5), kind: ast::LiteralKind::Verbatim, c: '7', @@ -4965,15 +5161,15 @@ bar assert_eq!( parser("[[:alnum:]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..11), negated: false, kind: itemset(item_ascii(alnum(span(1..10), false))), - }))) + })) ); assert_eq!( parser("[[[:alnum:]]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..13), negated: false, kind: itemset(item_bracket(ast::ClassBracketed { @@ -4981,11 +5177,11 @@ bar negated: false, kind: itemset(item_ascii(alnum(span(2..11), false))), })), - }))) + })) ); assert_eq!( parser("[[:alnum:]&&[:lower:]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: intersection( @@ -4993,11 +5189,11 @@ bar itemset(item_ascii(alnum(span(1..10), false))), itemset(item_ascii(lower(span(12..21), false))), ), - }))) + })) ); assert_eq!( parser("[[:alnum:]--[:lower:]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: difference( @@ -5005,11 +5201,11 @@ bar itemset(item_ascii(alnum(span(1..10), false))), itemset(item_ascii(lower(span(12..21), false))), ), - }))) + })) ); assert_eq!( parser("[[:alnum:]~~[:lower:]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: symdifference( @@ -5017,20 +5213,20 @@ bar itemset(item_ascii(alnum(span(1..10), false))), itemset(item_ascii(lower(span(12..21), false))), ), - }))) + })) ); assert_eq!( parser("[a]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: itemset(lit(span(1..2), 'a')), - }))) + })) ); assert_eq!( parser(r"[a\]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: union( @@ -5044,11 +5240,11 @@ bar }), ] ), - }))) + })) ); assert_eq!( parser(r"[a\-z]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: union( @@ -5063,44 +5259,44 @@ bar lit(span(4..5), 'z'), ] ), - }))) + })) ); assert_eq!( parser("[ab]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( span(1..3), vec![lit(span(1..2), 'a'), lit(span(2..3), 'b'),] ), - }))) + })) ); assert_eq!( parser("[a-]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( span(1..3), vec![lit(span(1..2), 'a'), lit(span(2..3), '-'),] ), - }))) + })) ); assert_eq!( parser("[-a]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( span(1..3), vec![lit(span(1..2), '-'), lit(span(2..3), 'a'),] ), - }))) + })) ); assert_eq!( parser(r"[\pL]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: itemset(item_unicode(ast::ClassUnicode { @@ -5108,11 +5304,11 @@ bar negated: false, kind: ast::ClassUnicodeKind::OneLetter('L'), })), - }))) + })) ); assert_eq!( parser(r"[\w]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: itemset(item_perl(ast::ClassPerl { @@ -5120,11 +5316,11 @@ bar kind: ast::ClassPerlKind::Word, negated: false, })), - }))) + })) ); assert_eq!( parser(r"[a\wz]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: union( @@ -5139,20 +5335,20 @@ bar lit(span(4..5), 'z'), ] ), - }))) + })) ); assert_eq!( parser("[a-z]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: itemset(range(span(1..4), 'a', 'z')), - }))) + })) ); assert_eq!( parser("[a-cx-z]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..8), negated: false, kind: union( @@ -5162,11 +5358,11 @@ bar range(span(4..7), 'x', 'z'), ] ), - }))) + })) ); assert_eq!( parser(r"[\w&&a-cx-z]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..12), negated: false, kind: intersection( @@ -5184,11 +5380,11 @@ bar ] ), ), - }))) + })) ); assert_eq!( parser(r"[a-cx-z&&\w]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..12), negated: false, kind: intersection( @@ -5206,11 +5402,11 @@ bar negated: false, })), ), - }))) + })) ); assert_eq!( parser(r"[a--b--c]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..9), negated: false, kind: difference( @@ -5222,11 +5418,11 @@ bar ), itemset(lit(span(7..8), 'c')), ), - }))) + })) ); assert_eq!( parser(r"[a~~b~~c]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..9), negated: false, kind: symdifference( @@ -5238,11 +5434,11 @@ bar ), itemset(lit(span(7..8), 'c')), ), - }))) + })) ); assert_eq!( parser(r"[\^&&^]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..7), negated: false, kind: intersection( @@ -5254,11 +5450,11 @@ bar })), itemset(lit(span(5..6), '^')), ), - }))) + })) ); assert_eq!( parser(r"[\&&&&]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..7), negated: false, kind: intersection( @@ -5270,11 +5466,11 @@ bar })), itemset(lit(span(5..6), '&')), ), - }))) + })) ); assert_eq!( parser(r"[&&&&]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: intersection( @@ -5286,13 +5482,13 @@ bar ), itemset(empty(span(5..5))), ), - }))) + })) ); let pat = "[☃-⛄]"; assert_eq!( parser(pat).parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span_range(pat, 0..9), negated: false, kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange { @@ -5308,20 +5504,20 @@ bar c: '⛄', }, })), - }))) + })) ); assert_eq!( parser(r"[]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: itemset(lit(span(1..2), ']')), - }))) + })) ); assert_eq!( parser(r"[]\[]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: union( @@ -5335,14 +5531,14 @@ bar }), ] ), - }))) + })) ); assert_eq!( parser(r"[\[]]").parse(), Ok(concat( 0..5, vec![ - Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: itemset(ast::ClassSetItem::Literal( @@ -5352,8 +5548,8 @@ bar c: '[', } )), - })), - Ast::Literal(ast::Literal { + }), + Ast::literal(ast::Literal { span: span(4..5), kind: ast::LiteralKind::Verbatim, c: ']', @@ -5914,15 +6110,15 @@ bar assert_eq!( parser(r"\pNz").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..4), asts: vec![ - Ast::Class(ast::Class::Unicode(ast::ClassUnicode { + Ast::class_unicode(ast::ClassUnicode { span: span(0..3), negated: false, kind: ast::ClassUnicodeKind::OneLetter('N'), - })), - Ast::Literal(ast::Literal { + }), + Ast::literal(ast::Literal { span: span(3..4), kind: ast::LiteralKind::Verbatim, c: 'z', @@ -5932,15 +6128,15 @@ bar ); assert_eq!( parser(r"\p{Greek}z").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..10), asts: vec![ - Ast::Class(ast::Class::Unicode(ast::ClassUnicode { + Ast::class_unicode(ast::ClassUnicode { span: span(0..9), negated: false, kind: ast::ClassUnicodeKind::Named(s("Greek")), - })), - Ast::Literal(ast::Literal { + }), + Ast::literal(ast::Literal { span: span(9..10), kind: ast::LiteralKind::Verbatim, c: 'z', @@ -6017,23 +6213,23 @@ bar assert_eq!( parser(r"\d").parse(), - Ok(Ast::Class(ast::Class::Perl(ast::ClassPerl { + Ok(Ast::class_perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Digit, negated: false, - }))) + })) ); assert_eq!( parser(r"\dz").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..3), asts: vec![ - Ast::Class(ast::Class::Perl(ast::ClassPerl { + Ast::class_perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Digit, negated: false, - })), - Ast::Literal(ast::Literal { + }), + Ast::literal(ast::Literal { span: span(2..3), kind: ast::LiteralKind::Verbatim, c: 'z', diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 86a87e1439..1ceb3c7faa 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -80,27 +80,21 @@ impl Visitor for Writer { fn visit_pre(&mut self, ast: &Ast) -> fmt::Result { match *ast { Ast::Group(ref x) => self.fmt_group_pre(x), - Ast::Class(ast::Class::Bracketed(ref x)) => { - self.fmt_class_bracketed_pre(x) - } + Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_pre(x), _ => Ok(()), } } fn visit_post(&mut self, ast: &Ast) -> fmt::Result { - use crate::ast::Class; - match *ast { Ast::Empty(_) => Ok(()), Ast::Flags(ref x) => self.fmt_set_flags(x), Ast::Literal(ref x) => self.fmt_literal(x), Ast::Dot(_) => self.wtr.write_str("."), Ast::Assertion(ref x) => self.fmt_assertion(x), - Ast::Class(Class::Perl(ref x)) => self.fmt_class_perl(x), - Ast::Class(Class::Unicode(ref x)) => self.fmt_class_unicode(x), - Ast::Class(Class::Bracketed(ref x)) => { - self.fmt_class_bracketed_post(x) - } + Ast::ClassPerl(ref x) => self.fmt_class_perl(x), + Ast::ClassUnicode(ref x) => self.fmt_class_unicode(x), + Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_post(x), Ast::Repetition(ref x) => self.fmt_repetition(x), Ast::Group(ref x) => self.fmt_group_post(x), Ast::Alternation(_) => Ok(()), @@ -267,6 +261,12 @@ impl Writer { EndText => self.wtr.write_str(r"\z"), WordBoundary => self.wtr.write_str(r"\b"), NotWordBoundary => self.wtr.write_str(r"\B"), + WordBoundaryStart => self.wtr.write_str(r"\b{start}"), + WordBoundaryEnd => self.wtr.write_str(r"\b{end}"), + WordBoundaryStartAngle => self.wtr.write_str(r"\<"), + WordBoundaryEndAngle => self.wtr.write_str(r"\>"), + WordBoundaryStartHalf => self.wtr.write_str(r"\b{start-half}"), + WordBoundaryEndHalf => self.wtr.write_str(r"\b{end-half}"), } } diff --git a/regex-syntax/src/ast/visitor.rs b/regex-syntax/src/ast/visitor.rs index 03d12a14db..c1bb24d971 100644 --- a/regex-syntax/src/ast/visitor.rs +++ b/regex-syntax/src/ast/visitor.rs @@ -264,7 +264,7 @@ impl<'a> HeapVisitor<'a> { visitor: &mut V, ) -> Result>, V::Err> { Ok(match *ast { - Ast::Class(ast::Class::Bracketed(ref x)) => { + Ast::ClassBracketed(ref x) => { self.visit_class(x, visitor)?; None } diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs index e063390a8f..e3051bf314 100644 --- a/regex-syntax/src/hir/interval.rs +++ b/regex-syntax/src/hir/interval.rs @@ -19,7 +19,7 @@ use crate::unicode; // // Some of the implementation complexity here is a result of me wanting to // preserve the sequential representation without using additional memory. -// In many cases, we do use linear extra memory, but it is at most 2x and it +// In some cases, we do use linear extra memory, but it is at most 2x and it // is amortized. If we relaxed the memory requirements, this implementation // could become much simpler. The extra memory is honestly probably OK, but // character classes (especially of the Unicode variety) can become quite @@ -81,14 +81,45 @@ impl IntervalSet { /// Add a new interval to this set. pub fn push(&mut self, interval: I) { - // TODO: This could be faster. e.g., Push the interval such that - // it preserves canonicalization. - self.ranges.push(interval); - self.canonicalize(); // We don't know whether the new interval added here is considered // case folded, so we conservatively assume that the entire set is // no longer case folded if it was previously. self.folded = false; + + if self.ranges.is_empty() { + self.ranges.push(interval); + return; + } + + // Find the first range that is not greater than the new interval. + // This is the first range that could possibly be unioned with the + // new interval. + let mut drain_end = self.ranges.len(); + while drain_end > 0 + && self.ranges[drain_end - 1].lower() > interval.upper() + && !self.ranges[drain_end - 1].is_contiguous(&interval) + { + drain_end -= 1; + } + + // Try to union the new interval with old intervals backwards. + if drain_end > 0 && self.ranges[drain_end - 1].is_contiguous(&interval) + { + self.ranges[drain_end - 1] = + self.ranges[drain_end - 1].union(&interval).unwrap(); + for i in (0..drain_end - 1).rev() { + if let Some(union) = + self.ranges[drain_end - 1].union(&self.ranges[i]) + { + self.ranges[drain_end - 1] = union; + } else { + self.ranges.drain(i + 1..drain_end - 1); + break; + } + } + } else { + self.ranges.insert(drain_end, interval); + } } /// Return an iterator over all intervals in this set. @@ -192,34 +223,13 @@ impl IntervalSet { // Folks seem to suggest interval or segment trees, but I'd like to // avoid the overhead (both runtime and conceptual) of that. // - // The following is basically my Shitty First Draft. Therefore, in - // order to grok it, you probably need to read each line carefully. - // Simplifications are most welcome! - // // Remember, we can assume the canonical format invariant here, which // says that all ranges are sorted, not overlapping and not adjacent in // each class. let drain_end = self.ranges.len(); - let (mut a, mut b) = (0, 0); - 'LOOP: while a < drain_end && b < other.ranges.len() { - // Basically, the easy cases are when neither range overlaps with - // each other. If the `b` range is less than our current `a` - // range, then we can skip it and move on. - if other.ranges[b].upper() < self.ranges[a].lower() { - b += 1; - continue; - } - // ... similarly for the `a` range. If it's less than the smallest - // `b` range, then we can add it as-is. - if self.ranges[a].upper() < other.ranges[b].lower() { - let range = self.ranges[a]; - self.ranges.push(range); - a += 1; - continue; - } - // Otherwise, we have overlapping ranges. - assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b])); + let mut b = 0; + for a in 0..drain_end { // This part is tricky and was non-obvious to me without looking // at explicit examples (see the tests). The trickiness stems from // two things: 1) subtracting a range from another range could @@ -231,47 +241,34 @@ impl IntervalSet { // For example, if our `a` range is `a-t` and our next three `b` // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply // subtraction three times before moving on to the next `a` range. - let mut range = self.ranges[a]; + self.ranges.push(self.ranges[a]); + // Only when `b` is not above `a`, `b` might apply to current + // `a` range. while b < other.ranges.len() - && !range.is_intersection_empty(&other.ranges[b]) + && other.ranges[b].lower() <= self.ranges[a].upper() { - let old_range = range; - range = match range.difference(&other.ranges[b]) { - (None, None) => { - // We lost the entire range, so move on to the next - // without adding this one. - a += 1; - continue 'LOOP; + match self.ranges.pop().unwrap().difference(&other.ranges[b]) { + (Some(range1), None) | (None, Some(range1)) => { + self.ranges.push(range1); } - (Some(range1), None) | (None, Some(range1)) => range1, (Some(range1), Some(range2)) => { self.ranges.push(range1); - range2 + self.ranges.push(range2); } - }; - // It's possible that the `b` range has more to contribute - // here. In particular, if it is greater than the original - // range, then it might impact the next `a` range *and* it - // has impacted the current `a` range as much as possible, - // so we can quit. We don't bump `b` so that the next `a` - // range can apply it. - if other.ranges[b].upper() > old_range.upper() { - break; + (None, None) => {} } - // Otherwise, the next `b` range might apply to the current + // The next `b` range might apply to the current // `a` range. b += 1; } - self.ranges.push(range); - a += 1; - } - while a < drain_end { - let range = self.ranges[a]; - self.ranges.push(range); - a += 1; + // It's possible that the last `b` range has more to + // contribute to the next `a`. We don't bump the last + // `b` so that the next `a` range can apply it. + b = b.saturating_sub(1); } + self.ranges.drain(..drain_end); - self.folded = self.folded && other.folded; + self.folded = self.ranges.is_empty() || (self.folded && other.folded); } /// Compute the symmetric difference of the two sets, in place. @@ -282,11 +279,83 @@ impl IntervalSet { /// set. That is, the set will contain all elements in either set, /// but will not contain any elements that are in both sets. pub fn symmetric_difference(&mut self, other: &IntervalSet) { - // TODO(burntsushi): Fix this so that it amortizes allocation. - let mut intersection = self.clone(); - intersection.intersect(other); - self.union(other); - self.difference(&intersection); + if self.ranges.is_empty() { + self.ranges.extend(&other.ranges); + self.folded = other.folded; + return; + } + if other.ranges.is_empty() { + return; + } + + // There should be a way to do this in-place with constant memory, + // but I couldn't figure out a simple way to do it. So just append + // the symmetric difference to the end of this range, and then drain + // it before we're done. + let drain_end = self.ranges.len(); + let mut b = 0; + let mut b_range = Some(other.ranges[b]); + for a in 0..drain_end { + self.ranges.push(self.ranges[a]); + while b_range + .map_or(false, |r| r.lower() <= self.ranges[a].upper()) + { + let (range1, range2) = match self + .ranges + .pop() + .unwrap() + .symmetric_difference(&b_range.as_ref().unwrap()) + { + (Some(range1), None) | (None, Some(range1)) => { + (Some(range1), None) + } + (Some(range1), Some(range2)) => { + (Some(range1), Some(range2)) + } + (None, None) => (None, None), + }; + if let Some(range) = range1 { + if self.ranges.len() > drain_end + && self.ranges.last().unwrap().is_contiguous(&range) + { + self.ranges + .last_mut() + .map(|last| *last = last.union(&range).unwrap()); + } else { + self.ranges.push(range); + } + } + if let Some(range) = range2 { + self.ranges.push(range); + } + + b_range = if self.ranges.len() > drain_end + && self.ranges.last().unwrap().upper() + > self.ranges[a].upper() + { + Some(*self.ranges.last().unwrap()) + } else { + b += 1; + other.ranges.get(b).cloned() + }; + } + } + while let Some(range) = b_range { + if self.ranges.len() > drain_end + && self.ranges.last().unwrap().is_contiguous(&range) + { + self.ranges + .last_mut() + .map(|last| *last = last.union(&range).unwrap()); + } else { + self.ranges.push(range); + } + b += 1; + b_range = other.ranges.get(b).cloned(); + } + + self.ranges.drain(..drain_end); + self.folded = self.ranges.is_empty() || (self.folded && other.folded); } /// Negate this interval set. @@ -302,28 +371,44 @@ impl IntervalSet { return; } - // There should be a way to do this in-place with constant memory, - // but I couldn't figure out a simple way to do it. So just append - // the negation to the end of this range, and then drain it before - // we're done. - let drain_end = self.ranges.len(); - // We do checked arithmetic below because of the canonical ordering // invariant. if self.ranges[0].lower() > I::Bound::min_value() { - let upper = self.ranges[0].lower().decrement(); - self.ranges.push(I::create(I::Bound::min_value(), upper)); - } - for i in 1..drain_end { - let lower = self.ranges[i - 1].upper().increment(); - let upper = self.ranges[i].lower().decrement(); - self.ranges.push(I::create(lower, upper)); - } - if self.ranges[drain_end - 1].upper() < I::Bound::max_value() { - let lower = self.ranges[drain_end - 1].upper().increment(); - self.ranges.push(I::create(lower, I::Bound::max_value())); + let mut pre_upper = self.ranges[0].upper(); + self.ranges[0] = I::create( + I::Bound::min_value(), + self.ranges[0].lower().decrement(), + ); + for i in 1..self.ranges.len() { + let lower = pre_upper.increment(); + pre_upper = self.ranges[i].upper(); + self.ranges[i] = + I::create(lower, self.ranges[i].lower().decrement()); + } + if pre_upper < I::Bound::max_value() { + self.ranges.push(I::create( + pre_upper.increment(), + I::Bound::max_value(), + )); + } + } else { + for i in 1..self.ranges.len() { + self.ranges[i - 1] = I::create( + self.ranges[i - 1].upper().increment(), + self.ranges[i].lower().decrement(), + ); + } + if self.ranges.last().unwrap().upper() < I::Bound::max_value() { + self.ranges.last_mut().map(|range| { + *range = I::create( + range.upper().increment(), + I::Bound::max_value(), + ) + }); + } else { + self.ranges.pop(); + } } - self.ranges.drain(..drain_end); // We don't need to update whether this set is folded or not, because // it is conservatively preserved through negation. Namely, if a set // is not folded, then it is possible that its negation is folded, for @@ -337,6 +422,7 @@ impl IntervalSet { // of case folded characters. Negating it in turn means that all // equivalence classes in the set are negated, and any equivalence // class that was previously not in the set is now entirely in the set. + self.folded = self.ranges.is_empty() || self.folded; } /// Converts this set into a canonical ordering. @@ -347,24 +433,20 @@ impl IntervalSet { self.ranges.sort(); assert!(!self.ranges.is_empty()); - // Is there a way to do this in-place with constant memory? I couldn't - // figure out a way to do it. So just append the canonicalization to - // the end of this range, and then drain it before we're done. - let drain_end = self.ranges.len(); - for oldi in 0..drain_end { - // If we've added at least one new range, then check if we can - // merge this range in the previously added range. - if self.ranges.len() > drain_end { - let (last, rest) = self.ranges.split_last_mut().unwrap(); - if let Some(union) = last.union(&rest[oldi]) { - *last = union; - continue; - } + // We maintain the canonicalization results in-place at `0..newi`. + // `newi` will keep track of the end of the canonicalized ranges. + let mut newi = 0; + for oldi in 1..self.ranges.len() { + // The last new range gets merged with currnet old range when + // unionable. If not, we update `newi` and store it as a new range. + if let Some(union) = self.ranges[newi].union(&self.ranges[oldi]) { + self.ranges[newi] = union; + } else { + newi += 1; + self.ranges[newi] = self.ranges[oldi]; } - let range = self.ranges[oldi]; - self.ranges.push(range); } - self.ranges.drain(..drain_end); + self.ranges.truncate(newi + 1); } /// Returns true if and only if this class is in a canonical ordering. @@ -486,7 +568,13 @@ pub trait Interval: other: &Self, ) -> (Option, Option) { let union = match self.union(other) { - None => return (Some(self.clone()), Some(other.clone())), + None => { + return if self.upper() < other.lower() { + (Some(self.clone()), Some(other.clone())) + } else { + (Some(other.clone()), Some(self.clone())) + } + } Some(union) => union, }; let intersection = match self.intersect(other) { diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index afcd506e0c..a5a3737f68 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -2235,24 +2235,19 @@ impl PreferenceTrie { /// after them and because any removed literals are guaranteed to never /// match. fn minimize(literals: &mut Vec, keep_exact: bool) { - use core::cell::RefCell; - - // MSRV(1.61): Use retain_mut here to avoid interior mutability. - let trie = RefCell::new(PreferenceTrie { + let mut trie = PreferenceTrie { states: vec![], matches: vec![], next_literal_index: 1, - }); + }; let mut make_inexact = vec![]; - literals.retain(|lit| { - match trie.borrow_mut().insert(lit.as_bytes()) { - Ok(_) => true, - Err(i) => { - if !keep_exact { - make_inexact.push(i.checked_sub(1).unwrap()); - } - false + literals.retain_mut(|lit| match trie.insert(lit.as_bytes()) { + Ok(_) => true, + Err(i) => { + if !keep_exact { + make_inexact.push(i.checked_sub(1).unwrap()); } + false } }); for i in make_inexact { diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 6c1d2745e0..ce38ead7b8 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -797,13 +797,18 @@ impl core::fmt::Debug for Literal { /// The high-level intermediate representation of a character class. /// /// A character class corresponds to a set of characters. A character is either -/// defined by a Unicode scalar value or a byte. Unicode characters are used -/// by default, while bytes are used when Unicode mode (via the `u` flag) is -/// disabled. +/// defined by a Unicode scalar value or a byte. /// /// A character class, regardless of its character type, is represented by a /// sequence of non-overlapping non-adjacent ranges of characters. /// +/// There are no guarantees about which class variant is used. Generally +/// speaking, the Unicode variat is used whenever a class needs to contain +/// non-ASCII Unicode scalar values. But the Unicode variant can be used even +/// when Unicode mode is disabled. For example, at the time of writing, the +/// regex `(?-u:a|\xc2\xa0)` will compile down to HIR for the Unicode class +/// `[a\u00A0]` due to optimizations. +/// /// Note that `Bytes` variant may be produced even when it exclusively matches /// valid UTF-8. This is because a `Bytes` variant represents an intention by /// the author of the regular expression to disable Unicode mode, which in turn @@ -1326,8 +1331,9 @@ impl ClassUnicodeRange { } } -/// A set of characters represented by arbitrary bytes (where one byte -/// corresponds to one character). +/// A set of characters represented by arbitrary bytes. +/// +/// Each byte corresponds to one character. #[derive(Clone, Debug, Eq, PartialEq)] pub struct ClassBytes { set: IntervalSet, @@ -1629,6 +1635,42 @@ pub enum Look { WordUnicode = 1 << 8, /// Match a Unicode-aware negation of a word boundary. WordUnicodeNegate = 1 << 9, + /// Match the start of an ASCII-only word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartAscii = 1 << 10, + /// Match the end of an ASCII-only word boundary. That is, this matches + /// a position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndAscii = 1 << 11, + /// Match the start of a Unicode word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartUnicode = 1 << 12, + /// Match the end of a Unicode word boundary. That is, this matches a + /// position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndUnicode = 1 << 13, + /// Match the start half of an ASCII-only word boundary. That is, this + /// matches a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfAscii = 1 << 14, + /// Match the end half of an ASCII-only word boundary. That is, this + /// matches a position at either the end of the haystack or where the + /// following character is not a word character. + WordEndHalfAscii = 1 << 15, + /// Match the start half of a Unicode word boundary. That is, this matches + /// a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfUnicode = 1 << 16, + /// Match the end half of a Unicode word boundary. That is, this matches + /// a position at either the end of the haystack or where the following + /// character is not a word character. + WordEndHalfUnicode = 1 << 17, } impl Look { @@ -1650,6 +1692,14 @@ impl Look { Look::WordAsciiNegate => Look::WordAsciiNegate, Look::WordUnicode => Look::WordUnicode, Look::WordUnicodeNegate => Look::WordUnicodeNegate, + Look::WordStartAscii => Look::WordEndAscii, + Look::WordEndAscii => Look::WordStartAscii, + Look::WordStartUnicode => Look::WordEndUnicode, + Look::WordEndUnicode => Look::WordStartUnicode, + Look::WordStartHalfAscii => Look::WordEndHalfAscii, + Look::WordEndHalfAscii => Look::WordStartHalfAscii, + Look::WordStartHalfUnicode => Look::WordEndHalfUnicode, + Look::WordEndHalfUnicode => Look::WordStartHalfUnicode, } } @@ -1658,28 +1708,36 @@ impl Look { /// constructor is guaranteed to return the same look-around variant that /// one started with within a semver compatible release of this crate. #[inline] - pub const fn as_repr(self) -> u16 { + pub const fn as_repr(self) -> u32 { // AFAIK, 'as' is the only way to zero-cost convert an int enum to an // actual int. - self as u16 + self as u32 } /// Given the underlying representation of a `Look` value, return the /// corresponding `Look` value if the representation is valid. Otherwise /// `None` is returned. #[inline] - pub const fn from_repr(repr: u16) -> Option { + pub const fn from_repr(repr: u32) -> Option { match repr { - 0b00_0000_0001 => Some(Look::Start), - 0b00_0000_0010 => Some(Look::End), - 0b00_0000_0100 => Some(Look::StartLF), - 0b00_0000_1000 => Some(Look::EndLF), - 0b00_0001_0000 => Some(Look::StartCRLF), - 0b00_0010_0000 => Some(Look::EndCRLF), - 0b00_0100_0000 => Some(Look::WordAscii), - 0b00_1000_0000 => Some(Look::WordAsciiNegate), - 0b01_0000_0000 => Some(Look::WordUnicode), - 0b10_0000_0000 => Some(Look::WordUnicodeNegate), + 0b00_0000_0000_0000_0001 => Some(Look::Start), + 0b00_0000_0000_0000_0010 => Some(Look::End), + 0b00_0000_0000_0000_0100 => Some(Look::StartLF), + 0b00_0000_0000_0000_1000 => Some(Look::EndLF), + 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF), + 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF), + 0b00_0000_0000_0100_0000 => Some(Look::WordAscii), + 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate), + 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode), + 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate), + 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii), + 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii), + 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode), + 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode), + 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii), + 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii), + 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode), + 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode), _ => None, } } @@ -1704,6 +1762,14 @@ impl Look { Look::WordAsciiNegate => 'B', Look::WordUnicode => '𝛃', Look::WordUnicodeNegate => '𝚩', + Look::WordStartAscii => '<', + Look::WordEndAscii => '>', + Look::WordStartUnicode => '〈', + Look::WordEndUnicode => '〉', + Look::WordStartHalfAscii => '◁', + Look::WordEndHalfAscii => '▷', + Look::WordStartHalfUnicode => '◀', + Look::WordEndHalfUnicode => '▶', } } } @@ -2594,7 +2660,7 @@ pub struct LookSet { /// range of `u16` values to be represented. For example, even if the /// current implementation only makes use of the 10 least significant bits, /// it may use more bits in a future semver compatible release. - pub bits: u16, + pub bits: u32, } impl LookSet { @@ -2697,13 +2763,22 @@ impl LookSet { pub fn contains_word_unicode(self) -> bool { self.contains(Look::WordUnicode) || self.contains(Look::WordUnicodeNegate) + || self.contains(Look::WordStartUnicode) + || self.contains(Look::WordEndUnicode) + || self.contains(Look::WordStartHalfUnicode) + || self.contains(Look::WordEndHalfUnicode) } /// Returns true if and only if this set contains any ASCII word boundary /// or negated ASCII word boundary assertions. #[inline] pub fn contains_word_ascii(self) -> bool { - self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate) + self.contains(Look::WordAscii) + || self.contains(Look::WordAsciiNegate) + || self.contains(Look::WordStartAscii) + || self.contains(Look::WordEndAscii) + || self.contains(Look::WordStartHalfAscii) + || self.contains(Look::WordEndHalfAscii) } /// Returns an iterator over all of the look-around assertions in this set. @@ -2782,29 +2857,31 @@ impl LookSet { *self = self.intersect(other); } - /// Return a `LookSet` from the slice given as a native endian 16-bit + /// Return a `LookSet` from the slice given as a native endian 32-bit /// integer. /// /// # Panics /// - /// This panics if `slice.len() < 2`. + /// This panics if `slice.len() < 4`. #[inline] pub fn read_repr(slice: &[u8]) -> LookSet { - let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap()); + let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap()); LookSet { bits } } - /// Write a `LookSet` as a native endian 16-bit integer to the beginning + /// Write a `LookSet` as a native endian 32-bit integer to the beginning /// of the slice given. /// /// # Panics /// - /// This panics if `slice.len() < 2`. + /// This panics if `slice.len() < 4`. #[inline] pub fn write_repr(self, slice: &mut [u8]) { let raw = self.bits.to_ne_bytes(); slice[0] = raw[0]; slice[1] = raw[1]; + slice[2] = raw[2]; + slice[3] = raw[3]; } } @@ -2837,9 +2914,9 @@ impl Iterator for LookSetIter { return None; } // We'll never have more than u8::MAX distinct look-around assertions, - // so 'repr' will always fit into a u16. - let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); - let look = Look::from_repr(1 << repr)?; + // so 'bit' will always fit into a u16. + let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); + let look = Look::from_repr(1 << bit)?; self.set = self.set.remove(look); Some(look) } @@ -3761,7 +3838,7 @@ mod tests { assert_eq!(0, set.iter().count()); let set = LookSet::full(); - assert_eq!(10, set.iter().count()); + assert_eq!(18, set.iter().count()); let set = LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode); @@ -3779,6 +3856,6 @@ mod tests { let res = format!("{:?}", LookSet::empty()); assert_eq!("∅", res); let res = format!("{:?}", LookSet::full()); - assert_eq!("Az^$rRbB𝛃𝚩", res); + assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res); } } diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index aa737a092d..dfa6d40322 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -202,6 +202,30 @@ impl Visitor for Writer { hir::Look::WordUnicodeNegate => { self.wtr.write_str(r"\B")?; } + hir::Look::WordStartAscii => { + self.wtr.write_str(r"(?-u:\b{start})")?; + } + hir::Look::WordEndAscii => { + self.wtr.write_str(r"(?-u:\b{end})")?; + } + hir::Look::WordStartUnicode => { + self.wtr.write_str(r"\b{start}")?; + } + hir::Look::WordEndUnicode => { + self.wtr.write_str(r"\b{end}")?; + } + hir::Look::WordStartHalfAscii => { + self.wtr.write_str(r"(?-u:\b{start-half})")?; + } + hir::Look::WordEndHalfAscii => { + self.wtr.write_str(r"(?-u:\b{end-half})")?; + } + hir::Look::WordStartHalfUnicode => { + self.wtr.write_str(r"\b{start-half}")?; + } + hir::Look::WordEndHalfUnicode => { + self.wtr.write_str(r"\b{end-half}")?; + } }, HirKind::Capture(hir::Capture { ref name, .. }) => { self.wtr.write_str("(")?; diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 5430b51b27..2b500cc2f4 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -337,7 +337,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { fn visit_pre(&mut self, ast: &Ast) -> Result<()> { match *ast { - Ast::Class(ast::Class::Bracketed(_)) => { + Ast::ClassBracketed(_) => { if self.flags().unicode() { let cls = hir::ClassUnicode::empty(); self.push(HirFrame::ClassUnicode(cls)); @@ -354,14 +354,14 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { .unwrap_or_else(|| self.flags()); self.push(HirFrame::Group { old_flags }); } - Ast::Concat(ref x) if x.asts.is_empty() => {} Ast::Concat(_) => { self.push(HirFrame::Concat); } - Ast::Alternation(ref x) if x.asts.is_empty() => {} - Ast::Alternation(_) => { + Ast::Alternation(ref x) => { self.push(HirFrame::Alternation); - self.push(HirFrame::AlternationBranch); + if !x.asts.is_empty() { + self.push(HirFrame::AlternationBranch); + } } _ => {} } @@ -386,29 +386,27 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { // consistency sake. self.push(HirFrame::Expr(Hir::empty())); } - Ast::Literal(ref x) => { - match self.ast_literal_to_scalar(x)? { - Either::Right(byte) => self.push_byte(byte), - Either::Left(ch) => { - if !self.flags().unicode() && ch.len_utf8() > 1 { - return Err(self - .error(x.span, ErrorKind::UnicodeNotAllowed)); - } - match self.case_fold_char(x.span, ch)? { - None => self.push_char(ch), - Some(expr) => self.push(HirFrame::Expr(expr)), - } + Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? { + Either::Right(byte) => self.push_byte(byte), + Either::Left(ch) => { + if !self.flags().unicode() && ch.len_utf8() > 1 { + return Err( + self.error(x.span, ErrorKind::UnicodeNotAllowed) + ); + } + match self.case_fold_char(x.span, ch)? { + None => self.push_char(ch), + Some(expr) => self.push(HirFrame::Expr(expr)), } } - // self.push(HirFrame::Expr(self.hir_literal(x)?)); - } - Ast::Dot(span) => { - self.push(HirFrame::Expr(self.hir_dot(span)?)); + }, + Ast::Dot(ref span) => { + self.push(HirFrame::Expr(self.hir_dot(**span)?)); } Ast::Assertion(ref x) => { self.push(HirFrame::Expr(self.hir_assertion(x)?)); } - Ast::Class(ast::Class::Perl(ref x)) => { + Ast::ClassPerl(ref x) => { if self.flags().unicode() { let cls = self.hir_perl_unicode_class(x)?; let hcls = hir::Class::Unicode(cls); @@ -419,11 +417,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::Expr(Hir::class(hcls))); } } - Ast::Class(ast::Class::Unicode(ref x)) => { + Ast::ClassUnicode(ref x) => { let cls = hir::Class::Unicode(self.hir_unicode_class(x)?); self.push(HirFrame::Expr(Hir::class(cls))); } - Ast::Class(ast::Class::Bracketed(ref ast)) => { + Ast::ClassBracketed(ref ast) => { if self.flags().unicode() { let mut cls = self.pop().unwrap().unwrap_class_unicode(); self.unicode_fold_and_negate( @@ -964,6 +962,34 @@ impl<'t, 'p> TranslatorI<'t, 'p> { } else { hir::Look::WordAsciiNegate }), + ast::AssertionKind::WordBoundaryStart + | ast::AssertionKind::WordBoundaryStartAngle => { + Hir::look(if unicode { + hir::Look::WordStartUnicode + } else { + hir::Look::WordStartAscii + }) + } + ast::AssertionKind::WordBoundaryEnd + | ast::AssertionKind::WordBoundaryEndAngle => { + Hir::look(if unicode { + hir::Look::WordEndUnicode + } else { + hir::Look::WordEndAscii + }) + } + ast::AssertionKind::WordBoundaryStartHalf => { + Hir::look(if unicode { + hir::Look::WordStartHalfUnicode + } else { + hir::Look::WordStartHalfAscii + }) + } + ast::AssertionKind::WordBoundaryEndHalf => Hir::look(if unicode { + hir::Look::WordEndHalfUnicode + } else { + hir::Look::WordEndHalfAscii + }), }) } @@ -3626,4 +3652,55 @@ mod tests { ]), ); } + + #[test] + fn regression_alt_empty_concat() { + use crate::ast::{self, Ast}; + + let span = Span::splat(Position::new(0, 0, 0)); + let ast = Ast::alternation(ast::Alternation { + span, + asts: vec![Ast::concat(ast::Concat { span, asts: vec![] })], + }); + + let mut t = Translator::new(); + assert_eq!(Ok(Hir::empty()), t.translate("", &ast)); + } + + #[test] + fn regression_empty_alt() { + use crate::ast::{self, Ast}; + + let span = Span::splat(Position::new(0, 0, 0)); + let ast = Ast::concat(ast::Concat { + span, + asts: vec![Ast::alternation(ast::Alternation { + span, + asts: vec![], + })], + }); + + let mut t = Translator::new(); + assert_eq!(Ok(Hir::fail()), t.translate("", &ast)); + } + + #[test] + fn regression_singleton_alt() { + use crate::{ + ast::{self, Ast}, + hir::Dot, + }; + + let span = Span::splat(Position::new(0, 0, 0)); + let ast = Ast::concat(ast::Concat { + span, + asts: vec![Ast::alternation(ast::Alternation { + span, + asts: vec![Ast::dot(span)], + })], + }); + + let mut t = Translator::new(); + assert_eq!(Ok(Hir::dot(Dot::AnyCharExceptLF)), t.translate("", &ast)); + } } diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index a552099c6a..20f25db716 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -168,18 +168,6 @@ The following features are available: #![forbid(unsafe_code)] #![deny(missing_docs, rustdoc::broken_intra_doc_links)] #![warn(missing_debug_implementations)] -// MSRV(1.62): Allow unused warnings. Needed for the 'allow' below, -// since the warning is no longer triggered in newer Rust releases. -// Once the 'allow(mutable_borrow_reservation_conflict)' can be -// removed, we can remove the 'allow(renamed_and_removed_lints)' too. -#![allow(renamed_and_removed_lints)] -// MSRV(1.62): This gets triggered on Rust <1.62, and since our MSRV -// is Rust 1.60 at the time of writing, a warning is displayed. But -// the lang team decided the code pattern flagged by this warning is -// OK, so the warning is innocuous. We can remove this explicit allow -// once we get to a Rust release where the warning is no longer -// triggered. I believe that's Rust 1.62. -#![allow(mutable_borrow_reservation_conflict)] #![cfg_attr(docsrs, feature(doc_auto_cfg))] #[cfg(any(test, feature = "std"))] @@ -334,6 +322,9 @@ pub fn is_escapeable_character(c: char) -> bool { // escapeable, \< and \> will result in a parse error. Thus, we can // turn them into something else in the future without it being a // backwards incompatible change. + // + // OK, now we support \< and \>, and we need to retain them as *not* + // escapeable here since the escape sequence is significant. '<' | '>' => false, _ => true, } diff --git a/src/builders.rs b/src/builders.rs index 46c4824c56..c111a96c0b 100644 --- a/src/builders.rs +++ b/src/builders.rs @@ -679,6 +679,7 @@ pub(crate) mod string { /// # Example /// /// ``` + /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041 /// use regex::RegexBuilder; /// /// // It may surprise you how big some seemingly small patterns can @@ -1246,6 +1247,7 @@ pub(crate) mod string { /// # Example /// /// ``` + /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041 /// use regex::RegexSetBuilder; /// /// // It may surprise you how big some seemingly small patterns can @@ -1856,6 +1858,7 @@ pub(crate) mod bytes { /// # Example /// /// ``` + /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041 /// use regex::bytes::RegexBuilder; /// /// // It may surprise you how big some seemingly small patterns can @@ -2428,6 +2431,7 @@ pub(crate) mod bytes { /// # Example /// /// ``` + /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041 /// use regex::bytes::RegexSetBuilder; /// /// // It may surprise you how big some seemingly small patterns can diff --git a/src/lib.rs b/src/lib.rs index 1e191b6924..6dbd3c2026 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -543,8 +543,10 @@ scalar value, even when it is encoded using multiple bytes. When Unicode mode is disabled (e.g., `(?-u:.)`), then `.` will match a single byte in all cases. * The character classes `\w`, `\d` and `\s` are all Unicode-aware by default. Use `(?-u:\w)`, `(?-u:\d)` and `(?-u:\s)` to get their ASCII-only definitions. -* Similarly, `\b` and `\B` use a Unicode definition of a "word" character. To -get ASCII-only word boundaries, use `(?-u:\b)` and `(?-u:\B)`. +* Similarly, `\b` and `\B` use a Unicode definition of a "word" character. +To get ASCII-only word boundaries, use `(?-u:\b)` and `(?-u:\B)`. This also +applies to the special word boundary assertions. (That is, `\b{start}`, +`\b{end}`, `\b{start-half}`, `\b{end-half}`.) * `^` and `$` are **not** Unicode-aware in multi-line mode. Namely, they only recognize `\n` (assuming CRLF mode is not enabled) and not any of the other forms of line terminators defined by Unicode. @@ -723,12 +725,16 @@ x{n}? exactly n x ### Empty matches
-^     the beginning of a haystack (or start-of-line with multi-line mode)
-$     the end of a haystack (or end-of-line with multi-line mode)
-\A    only the beginning of a haystack (even with multi-line mode enabled)
-\z    only the end of a haystack (even with multi-line mode enabled)
-\b    a Unicode word boundary (\w on one side and \W, \A, or \z on other)
-\B    not a Unicode word boundary
+^               the beginning of a haystack (or start-of-line with multi-line mode)
+$               the end of a haystack (or end-of-line with multi-line mode)
+\A              only the beginning of a haystack (even with multi-line mode enabled)
+\z              only the end of a haystack (even with multi-line mode enabled)
+\b              a Unicode word boundary (\w on one side and \W, \A, or \z on other)
+\B              not a Unicode word boundary
+\b{start}, \<   a Unicode start-of-word boundary (\W|\A on the left, \w on the right)
+\b{end}, \>     a Unicode end-of-word boundary (\w on the left, \W|\z on the right))
+\b{start-half}  half of a Unicode start-of-word boundary (\W|\A on the left)
+\b{end-half}    half of a Unicode end-of-word boundary (\W|\z on the right)
 
The empty regex is valid and matches the empty string. For example, the @@ -856,28 +862,32 @@ Note that this includes all possible escape sequences, even ones that are documented elsewhere.
-\*          literal *, applies to all ASCII except [0-9A-Za-z<>]
-\a          bell (\x07)
-\f          form feed (\x0C)
-\t          horizontal tab
-\n          new line
-\r          carriage return
-\v          vertical tab (\x0B)
-\A          matches at the beginning of a haystack
-\z          matches at the end of a haystack
-\b          word boundary assertion
-\B          negated word boundary assertion
-\123        octal character code, up to three digits (when enabled)
-\x7F        hex character code (exactly two digits)
-\x{10FFFF}  any hex character code corresponding to a Unicode code point
-\u007F      hex character code (exactly four digits)
-\u{7F}      any hex character code corresponding to a Unicode code point
-\U0000007F  hex character code (exactly eight digits)
-\U{7F}      any hex character code corresponding to a Unicode code point
-\p{Letter}  Unicode character class
-\P{Letter}  negated Unicode character class
-\d, \s, \w  Perl character class
-\D, \S, \W  negated Perl character class
+\*              literal *, applies to all ASCII except [0-9A-Za-z<>]
+\a              bell (\x07)
+\f              form feed (\x0C)
+\t              horizontal tab
+\n              new line
+\r              carriage return
+\v              vertical tab (\x0B)
+\A              matches at the beginning of a haystack
+\z              matches at the end of a haystack
+\b              word boundary assertion
+\B              negated word boundary assertion
+\b{start}, \<   start-of-word boundary assertion
+\b{end}, \>     end-of-word boundary assertion
+\b{start-half}  half of a start-of-word boundary assertion
+\b{end-half}    half of a end-of-word boundary assertion
+\123            octal character code, up to three digits (when enabled)
+\x7F            hex character code (exactly two digits)
+\x{10FFFF}      any hex character code corresponding to a Unicode code point
+\u007F          hex character code (exactly four digits)
+\u{7F}          any hex character code corresponding to a Unicode code point
+\U0000007F      hex character code (exactly eight digits)
+\U{7F}          any hex character code corresponding to a Unicode code point
+\p{Letter}      Unicode character class
+\P{Letter}      negated Unicode character class
+\d, \s, \w      Perl character class
+\D, \S, \W      negated Perl character class
 
### Perl character classes (Unicode friendly) diff --git a/src/regex/bytes.rs b/src/regex/bytes.rs index cc53482cbd..c742b095a1 100644 --- a/src/regex/bytes.rs +++ b/src/regex/bytes.rs @@ -2025,6 +2025,7 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> { /// This example shows how to create and use `CaptureLocations` in a search. /// /// ``` +/// # if !cfg!(target_pointer_width = "64") { return; } // see #1041 /// use regex::bytes::Regex; /// /// let re = Regex::new(r"(?\w+)\s+(?\w+)").unwrap(); diff --git a/src/regex/string.rs b/src/regex/string.rs index d5908ae0dd..177a2af345 100644 --- a/src/regex/string.rs +++ b/src/regex/string.rs @@ -2028,6 +2028,7 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> { /// This example shows how to create and use `CaptureLocations` in a search. /// /// ``` +/// # if !cfg!(target_pointer_width = "64") { return; } // see #1041 /// use regex::Regex; /// /// let re = Regex::new(r"(?\w+)\s+(?\w+)").unwrap(); diff --git a/testdata/line-terminator.toml b/testdata/line-terminator.toml index 4de72de31e..a398dafa2e 100644 --- a/testdata/line-terminator.toml +++ b/testdata/line-terminator.toml @@ -38,6 +38,18 @@ unescape = true line-terminator = '\xFF' utf8 = false +# This tests a tricky case where the line terminator is set to \r. This ensures +# that the StartLF look-behind assertion is tracked when computing the start +# state. +[[test]] +name = "carriage" +regex = '(?m)^[a-z]+' +haystack = 'ABC\rabc' +matches = [[4, 7]] +bounds = [4, 7] +unescape = true +line-terminator = '\r' + # This tests that we can set the line terminator to a byte corresponding to a # word character, and things work as expected. [[test]] diff --git a/testdata/regression.toml b/testdata/regression.toml index 03b15d6d54..09b2b1d1ca 100644 --- a/testdata/regression.toml +++ b/testdata/regression.toml @@ -782,3 +782,21 @@ match-kind = "all" search-kind = "overlapping" unicode = true utf8 = true + +# This tests that the PikeVM and the meta regex agree on a particular regex. +# This test previously failed when the ad hoc engines inside the meta engine +# did not handle quit states correctly. Namely, the Unicode word boundary here +# combined with a non-ASCII codepoint provokes the quit state. The ad hoc +# engines were previously returning a match even after entering the quit state +# if a match had been previously detected, but this is incorrect. The reason +# is that if a quit state is found, then the search must give up *immediately* +# because it prevents the search from finding the "proper" leftmost-first +# match. If it instead returns a match that has been found, it risks reporting +# an improper match, as it did in this case. +# +# See: https://github.com/rust-lang/regex/issues/1046 +[[test]] +name = "non-prefix-literal-quit-state" +regex = '.+\b\n' +haystack = "β77\n" +matches = [[0, 5]] diff --git a/testdata/word-boundary-special.toml b/testdata/word-boundary-special.toml new file mode 100644 index 0000000000..2b5a2a0acf --- /dev/null +++ b/testdata/word-boundary-special.toml @@ -0,0 +1,687 @@ +# These tests are for the "special" word boundary assertions. That is, +# \b{start}, \b{end}, \b{start-half}, \b{end-half}. These are specialty +# assertions for more niche use cases, but hitting those cases without these +# assertions is difficult. For example, \b{start-half} and \b{end-half} are +# used to implement the -w/--word-regexp flag in a grep program. + +# Tests for (?-u:\b{start}) + +[[test]] +name = "word-start-ascii-010" +regex = '\b{start}' +haystack = "a" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-ascii-020" +regex = '\b{start}' +haystack = "a " +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-ascii-030" +regex = '\b{start}' +haystack = " a " +matches = [[1, 1]] +unicode = false + +[[test]] +name = "word-start-ascii-040" +regex = '\b{start}' +haystack = "" +matches = [] +unicode = false + +[[test]] +name = "word-start-ascii-050" +regex = '\b{start}' +haystack = "ab" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-ascii-060" +regex = '\b{start}' +haystack = "𝛃" +matches = [] +unicode = false + +[[test]] +name = "word-start-ascii-060-bounds" +regex = '\b{start}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = false + +[[test]] +name = "word-start-ascii-070" +regex = '\b{start}' +haystack = " 𝛃 " +matches = [] +unicode = false + +[[test]] +name = "word-start-ascii-080" +regex = '\b{start}' +haystack = "𝛃𐆀" +matches = [] +unicode = false + +[[test]] +name = "word-start-ascii-090" +regex = '\b{start}' +haystack = "𝛃b" +matches = [[4, 4]] +unicode = false + +[[test]] +name = "word-start-ascii-110" +regex = '\b{start}' +haystack = "b𝛃" +matches = [[0, 0]] +unicode = false + +# Tests for (?-u:\b{end}) + +[[test]] +name = "word-end-ascii-010" +regex = '\b{end}' +haystack = "a" +matches = [[1, 1]] +unicode = false + +[[test]] +name = "word-end-ascii-020" +regex = '\b{end}' +haystack = "a " +matches = [[1, 1]] +unicode = false + +[[test]] +name = "word-end-ascii-030" +regex = '\b{end}' +haystack = " a " +matches = [[2, 2]] +unicode = false + +[[test]] +name = "word-end-ascii-040" +regex = '\b{end}' +haystack = "" +matches = [] +unicode = false + +[[test]] +name = "word-end-ascii-050" +regex = '\b{end}' +haystack = "ab" +matches = [[2, 2]] +unicode = false + +[[test]] +name = "word-end-ascii-060" +regex = '\b{end}' +haystack = "𝛃" +matches = [] +unicode = false + +[[test]] +name = "word-end-ascii-060-bounds" +regex = '\b{end}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = false + +[[test]] +name = "word-end-ascii-070" +regex = '\b{end}' +haystack = " 𝛃 " +matches = [] +unicode = false + +[[test]] +name = "word-end-ascii-080" +regex = '\b{end}' +haystack = "𝛃𐆀" +matches = [] +unicode = false + +[[test]] +name = "word-end-ascii-090" +regex = '\b{end}' +haystack = "𝛃b" +matches = [[5, 5]] +unicode = false + +[[test]] +name = "word-end-ascii-110" +regex = '\b{end}' +haystack = "b𝛃" +matches = [[1, 1]] +unicode = false + +# Tests for \b{start} + +[[test]] +name = "word-start-unicode-010" +regex = '\b{start}' +haystack = "a" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-020" +regex = '\b{start}' +haystack = "a " +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-030" +regex = '\b{start}' +haystack = " a " +matches = [[1, 1]] +unicode = true + +[[test]] +name = "word-start-unicode-040" +regex = '\b{start}' +haystack = "" +matches = [] +unicode = true + +[[test]] +name = "word-start-unicode-050" +regex = '\b{start}' +haystack = "ab" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-060" +regex = '\b{start}' +haystack = "𝛃" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-060-bounds" +regex = '\b{start}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = true + +[[test]] +name = "word-start-unicode-070" +regex = '\b{start}' +haystack = " 𝛃 " +matches = [[1, 1]] +unicode = true + +[[test]] +name = "word-start-unicode-080" +regex = '\b{start}' +haystack = "𝛃𐆀" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-090" +regex = '\b{start}' +haystack = "𝛃b" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-110" +regex = '\b{start}' +haystack = "b𝛃" +matches = [[0, 0]] +unicode = true + +# Tests for \b{end} + +[[test]] +name = "word-end-unicode-010" +regex = '\b{end}' +haystack = "a" +matches = [[1, 1]] +unicode = true + +[[test]] +name = "word-end-unicode-020" +regex = '\b{end}' +haystack = "a " +matches = [[1, 1]] +unicode = true + +[[test]] +name = "word-end-unicode-030" +regex = '\b{end}' +haystack = " a " +matches = [[2, 2]] +unicode = true + +[[test]] +name = "word-end-unicode-040" +regex = '\b{end}' +haystack = "" +matches = [] +unicode = true + +[[test]] +name = "word-end-unicode-050" +regex = '\b{end}' +haystack = "ab" +matches = [[2, 2]] +unicode = true + +[[test]] +name = "word-end-unicode-060" +regex = '\b{end}' +haystack = "𝛃" +matches = [[4, 4]] +unicode = true + +[[test]] +name = "word-end-unicode-060-bounds" +regex = '\b{end}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = true + +[[test]] +name = "word-end-unicode-070" +regex = '\b{end}' +haystack = " 𝛃 " +matches = [[5, 5]] +unicode = true + +[[test]] +name = "word-end-unicode-080" +regex = '\b{end}' +haystack = "𝛃𐆀" +matches = [[4, 4]] +unicode = true + +[[test]] +name = "word-end-unicode-090" +regex = '\b{end}' +haystack = "𝛃b" +matches = [[5, 5]] +unicode = true + +[[test]] +name = "word-end-unicode-110" +regex = '\b{end}' +haystack = "b𝛃" +matches = [[5, 5]] +unicode = true + +# Tests for (?-u:\b{start-half}) + +[[test]] +name = "word-start-half-ascii-010" +regex = '\b{start-half}' +haystack = "a" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-half-ascii-020" +regex = '\b{start-half}' +haystack = "a " +matches = [[0, 0], [2, 2]] +unicode = false + +[[test]] +name = "word-start-half-ascii-030" +regex = '\b{start-half}' +haystack = " a " +matches = [[0, 0], [1, 1], [3, 3]] +unicode = false + +[[test]] +name = "word-start-half-ascii-040" +regex = '\b{start-half}' +haystack = "" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-half-ascii-050" +regex = '\b{start-half}' +haystack = "ab" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-half-ascii-060" +regex = '\b{start-half}' +haystack = "𝛃" +matches = [[0, 0], [4, 4]] +unicode = false + +[[test]] +name = "word-start-half-ascii-060-noutf8" +regex = '\b{start-half}' +haystack = "𝛃" +matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]] +unicode = false +utf8 = false + +[[test]] +name = "word-start-half-ascii-060-bounds" +regex = '\b{start-half}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = false + +[[test]] +name = "word-start-half-ascii-070" +regex = '\b{start-half}' +haystack = " 𝛃 " +matches = [[0, 0], [1, 1], [5, 5], [6, 6]] +unicode = false + +[[test]] +name = "word-start-half-ascii-080" +regex = '\b{start-half}' +haystack = "𝛃𐆀" +matches = [[0, 0], [4, 4], [8, 8]] +unicode = false + +[[test]] +name = "word-start-half-ascii-090" +regex = '\b{start-half}' +haystack = "𝛃b" +matches = [[0, 0], [4, 4]] +unicode = false + +[[test]] +name = "word-start-half-ascii-110" +regex = '\b{start-half}' +haystack = "b𝛃" +matches = [[0, 0], [5, 5]] +unicode = false + +# Tests for (?-u:\b{end-half}) + +[[test]] +name = "word-end-half-ascii-010" +regex = '\b{end-half}' +haystack = "a" +matches = [[1, 1]] +unicode = false + +[[test]] +name = "word-end-half-ascii-020" +regex = '\b{end-half}' +haystack = "a " +matches = [[1, 1], [2, 2]] +unicode = false + +[[test]] +name = "word-end-half-ascii-030" +regex = '\b{end-half}' +haystack = " a " +matches = [[0, 0], [2, 2], [3, 3]] +unicode = false + +[[test]] +name = "word-end-half-ascii-040" +regex = '\b{end-half}' +haystack = "" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-end-half-ascii-050" +regex = '\b{end-half}' +haystack = "ab" +matches = [[2, 2]] +unicode = false + +[[test]] +name = "word-end-half-ascii-060" +regex = '\b{end-half}' +haystack = "𝛃" +matches = [[0, 0], [4, 4]] +unicode = false + +[[test]] +name = "word-end-half-ascii-060-bounds" +regex = '\b{end-half}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = false + +[[test]] +name = "word-end-half-ascii-070" +regex = '\b{end-half}' +haystack = " 𝛃 " +matches = [[0, 0], [1, 1], [5, 5], [6, 6]] +unicode = false + +[[test]] +name = "word-end-half-ascii-080" +regex = '\b{end-half}' +haystack = "𝛃𐆀" +matches = [[0, 0], [4, 4], [8, 8]] +unicode = false + +[[test]] +name = "word-end-half-ascii-090" +regex = '\b{end-half}' +haystack = "𝛃b" +matches = [[0, 0], [5, 5]] +unicode = false + +[[test]] +name = "word-end-half-ascii-110" +regex = '\b{end-half}' +haystack = "b𝛃" +matches = [[1, 1], [5, 5]] +unicode = false + +# Tests for \b{start-half} + +[[test]] +name = "word-start-half-unicode-010" +regex = '\b{start-half}' +haystack = "a" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-half-unicode-020" +regex = '\b{start-half}' +haystack = "a " +matches = [[0, 0], [2, 2]] +unicode = true + +[[test]] +name = "word-start-half-unicode-030" +regex = '\b{start-half}' +haystack = " a " +matches = [[0, 0], [1, 1], [3, 3]] +unicode = true + +[[test]] +name = "word-start-half-unicode-040" +regex = '\b{start-half}' +haystack = "" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-half-unicode-050" +regex = '\b{start-half}' +haystack = "ab" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-half-unicode-060" +regex = '\b{start-half}' +haystack = "𝛃" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-half-unicode-060-bounds" +regex = '\b{start-half}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = true + +[[test]] +name = "word-start-half-unicode-070" +regex = '\b{start-half}' +haystack = " 𝛃 " +matches = [[0, 0], [1, 1], [6, 6]] +unicode = true + +[[test]] +name = "word-start-half-unicode-080" +regex = '\b{start-half}' +haystack = "𝛃𐆀" +matches = [[0, 0], [8, 8]] +unicode = true + +[[test]] +name = "word-start-half-unicode-090" +regex = '\b{start-half}' +haystack = "𝛃b" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-half-unicode-110" +regex = '\b{start-half}' +haystack = "b𝛃" +matches = [[0, 0]] +unicode = true + +# Tests for \b{end-half} + +[[test]] +name = "word-end-half-unicode-010" +regex = '\b{end-half}' +haystack = "a" +matches = [[1, 1]] +unicode = true + +[[test]] +name = "word-end-half-unicode-020" +regex = '\b{end-half}' +haystack = "a " +matches = [[1, 1], [2, 2]] +unicode = true + +[[test]] +name = "word-end-half-unicode-030" +regex = '\b{end-half}' +haystack = " a " +matches = [[0, 0], [2, 2], [3, 3]] +unicode = true + +[[test]] +name = "word-end-half-unicode-040" +regex = '\b{end-half}' +haystack = "" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-end-half-unicode-050" +regex = '\b{end-half}' +haystack = "ab" +matches = [[2, 2]] +unicode = true + +[[test]] +name = "word-end-half-unicode-060" +regex = '\b{end-half}' +haystack = "𝛃" +matches = [[4, 4]] +unicode = true + +[[test]] +name = "word-end-half-unicode-060-bounds" +regex = '\b{end-half}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = true + +[[test]] +name = "word-end-half-unicode-070" +regex = '\b{end-half}' +haystack = " 𝛃 " +matches = [[0, 0], [5, 5], [6, 6]] +unicode = true + +[[test]] +name = "word-end-half-unicode-080" +regex = '\b{end-half}' +haystack = "𝛃𐆀" +matches = [[4, 4], [8, 8]] +unicode = true + +[[test]] +name = "word-end-half-unicode-090" +regex = '\b{end-half}' +haystack = "𝛃b" +matches = [[5, 5]] +unicode = true + +[[test]] +name = "word-end-half-unicode-110" +regex = '\b{end-half}' +haystack = "b𝛃" +matches = [[5, 5]] +unicode = true + +# Specialty tests. + +# Since \r is special cased in the start state computation (to deal with CRLF +# mode), this test ensures that the correct start state is computed when the +# pattern starts with a half word boundary assertion. +[[test]] +name = "word-start-half-ascii-carriage" +regex = '\b{start-half}[a-z]+' +haystack = 'ABC\rabc' +matches = [[4, 7]] +bounds = [4, 7] +unescape = true + +# Since \n is also special cased in the start state computation, this test +# ensures that the correct start state is computed when the pattern starts with +# a half word boundary assertion. +[[test]] +name = "word-start-half-ascii-linefeed" +regex = '\b{start-half}[a-z]+' +haystack = 'ABC\nabc' +matches = [[4, 7]] +bounds = [4, 7] +unescape = true + +# Like the carriage return test above, but with a custom line terminator. +[[test]] +name = "word-start-half-ascii-customlineterm" +regex = '\b{start-half}[a-z]+' +haystack = 'ABC!abc' +matches = [[4, 7]] +bounds = [4, 7] +unescape = true +line-terminator = '!' diff --git a/tests/lib.rs b/tests/lib.rs index badd57455d..b3f69423d9 100644 --- a/tests/lib.rs +++ b/tests/lib.rs @@ -49,6 +49,7 @@ fn suite() -> anyhow::Result { load!("unicode"); load!("utf8"); load!("word-boundary"); + load!("word-boundary-special"); load!("fowler/basic"); load!("fowler/nullsubexpr"); load!("fowler/repetition");