Skip to content

Commit 5e9204f

Browse files
committed
automata: fix more out-dated regex-cli commands
That should cover all of them. Closes #1053
1 parent 2e67b6f commit 5e9204f

File tree

10 files changed

+30
-26
lines changed

10 files changed

+30
-26
lines changed

regex-automata/src/dfa/accel.rs

+7-6
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,16 @@
66
// non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its
77
// DFA with regex-cli:
88
//
9-
// $ regex-cli debug dfa dense '(?-u)[^a]+a' -BbC
10-
// dense::DFA(
9+
// $ regex-cli debug dense dfa -p '(?-u)[^a]+a' -BbC --no-table
1110
// D 000000:
1211
// Q 000001:
1312
// *000002:
14-
// A 000003: \x00-` => 3, a => 5, b-\xFF => 3
15-
// >000004: \x00-` => 3, a => 4, b-\xFF => 3
16-
// 000005: \x00-\xFF => 2, EOI => 2
17-
// )
13+
// A 000003: \x00-` => 3, a => 8, b-\xFF => 3
14+
// A 000004: \x00-` => 4, a => 7, b-\xFF => 4
15+
// 000005: \x00-` => 4, b-\xFF => 4
16+
// 000006: \x00-` => 3, a => 6, b-\xFF => 3
17+
// 000007: \x00-\xFF => 2, EOI => 2
18+
// 000008: \x00-\xFF => 2, EOI => 2
1819
//
1920
// In particular, state 3 is accelerated (shown via the 'A' indicator) since
2021
// the only way to leave that state once entered is to see an 'a' byte. If

regex-automata/src/dfa/automaton.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -1132,7 +1132,7 @@ pub unsafe trait Automaton {
11321132
/// // implementation defined.
11331133
/// //
11341134
/// // N.B. We get '3' by inspecting the state machine using 'regex-cli'.
1135-
/// // e.g., try `regex-cli debug dfa dense '[^abc]+a' -BbUC`.
1135+
/// // e.g., try `regex-cli debug dense dfa -p '[^abc]+a' -BbUC`.
11361136
/// let id = StateID::new(3 * dfa.stride()).unwrap();
11371137
/// let accelerator = dfa.accelerator(id);
11381138
/// // The `[^abc]+` sub-expression permits [a, b, c] to be accelerated.

regex-automata/src/dfa/dense.rs

+3-2
Original file line numberDiff line numberDiff line change
@@ -1228,8 +1228,9 @@ impl Builder {
12281228
} else {
12291229
let mut set = nfa.byte_class_set().clone();
12301230
// It is important to distinguish any "quit" bytes from all other
1231-
// bytes. Otherwise, a non-quit byte may end up in the same class
1232-
// as a quit byte, and thus cause the DFA stop when it shouldn't.
1231+
// bytes. Otherwise, a non-quit byte may end up in the same
1232+
// class as a quit byte, and thus cause the DFA to stop when it
1233+
// shouldn't.
12331234
//
12341235
// Test case:
12351236
//

regex-automata/src/hybrid/dfa.rs

+6-4
Original file line numberDiff line numberDiff line change
@@ -2103,8 +2103,10 @@ impl<'i, 'c> Lazy<'i, 'c> {
21032103
/// Here's an example that justifies 'inline(never)'
21042104
///
21052105
/// ```ignore
2106-
/// regex-cli find hybrid dfa \
2107-
/// @all-codepoints-utf8-100x '\pL{100}' --cache-capacity 10000000
2106+
/// regex-cli find match hybrid \
2107+
/// --cache-capacity 100000000 \
2108+
/// -p '\pL{100}' \
2109+
/// all-codepoints-utf8-100x
21082110
/// ```
21092111
///
21102112
/// Where 'all-codepoints-utf8-100x' is the UTF-8 encoding of every
@@ -3830,8 +3832,8 @@ impl Config {
38303832
//
38313833
// Test case:
38323834
//
3833-
// regex-cli find hybrid regex -w @conn.json.1000x.log \
3834-
// '^#' '\b10\.55\.182\.100\b'
3835+
// regex-cli find match hybrid --unicode-word-boundary \
3836+
// -p '^#' -p '\b10\.55\.182\.100\b' conn.json.1000x.log
38353837
if !quit.is_empty() {
38363838
set.add_set(&quit);
38373839
}

regex-automata/src/hybrid/search.rs

+5-5
Original file line numberDiff line numberDiff line change
@@ -105,14 +105,14 @@ fn find_fwd_imp(
105105
// PERF: For justification of omitting bounds checks, it gives us a
106106
// ~10% bump in search time. This was used for a benchmark:
107107
//
108-
// regex-cli find hybrid dfa @bigfile '(?m)^.+$' -UBb
108+
// regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile
109109
//
110110
// PERF: For justification for the loop unrolling, we use a few
111111
// different tests:
112112
//
113-
// regex-cli find hybrid dfa @$bigfile '\w{50}' -UBb
114-
// regex-cli find hybrid dfa @$bigfile '(?m)^.+$' -UBb
115-
// regex-cli find hybrid dfa @$bigfile 'ZQZQZQZQ' -UBb
113+
// regex-cli find half hybrid -p '\w{50}' -UBb bigfile
114+
// regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile
115+
// regex-cli find half hybrid -p 'ZQZQZQZQ' -UBb bigfile
116116
//
117117
// And there are three different configurations:
118118
//
@@ -353,7 +353,7 @@ fn find_rev_imp(
353353
// anchored and on shorter haystacks. However, this still makes a
354354
// difference. Take this command for example:
355355
//
356-
// regex-cli find hybrid regex @$bigfile '(?m)^.+$' -UBb
356+
// regex-cli find match hybrid -p '(?m)^.+$' -UBb bigfile
357357
//
358358
// (Notice that we use 'find match hybrid', not 'find half hybrid'
359359
// like in the justification for the forward direction. The 'match'

regex-automata/src/nfa/thompson/compiler.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -1466,7 +1466,7 @@ impl Compiler {
14661466
// compare and contrast performance of the Pike VM when the code below
14671467
// is active vs the code above. Here's an example to try:
14681468
//
1469-
// regex-cli find match pikevm -b -p '(?m)^\w{20}' -y '@$smallishru'
1469+
// regex-cli find match pikevm -b -p '(?m)^\w{20}' non-ascii-file
14701470
//
14711471
// With Unicode classes generated below, this search takes about 45s on
14721472
// my machine. But with the compressed version above, the search takes

regex-automata/src/nfa/thompson/map.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ const INIT: u64 = 14695981039346656037;
6565
/// Specifically, one could observe the difference with std's hashmap via
6666
/// something like the following benchmark:
6767
///
68-
/// hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'"
68+
/// hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'"
6969
///
7070
/// But to observe that difference, you'd have to modify the code to use
7171
/// std's hashmap.

regex-automata/src/nfa/thompson/nfa.rs

+2-4
Original file line numberDiff line numberDiff line change
@@ -1841,14 +1841,12 @@ impl SparseTransitions {
18411841
// This is an alternative implementation that uses binary search. In
18421842
// some ad hoc experiments, like
18431843
//
1844-
// smallishru=OpenSubtitles2018.raw.sample.smallish.ru
1845-
// regex-cli find nfa thompson pikevm -b "@$smallishru" '\b\w+\b'
1844+
// regex-cli find match pikevm -b -p '\b\w+\b' non-ascii-file
18461845
//
18471846
// I could not observe any improvement, and in fact, things seemed to
18481847
// be a bit slower. I can see an improvement in at least one benchmark:
18491848
//
1850-
// allcpssmall=all-codepoints-utf8-10x
1851-
// regex-cli find nfa thompson pikevm @$allcpssmall '\pL{100}'
1849+
// regex-cli find match pikevm -b -p '\pL{100}' all-codepoints-utf8
18521850
//
18531851
// Where total search time goes from 3.2s to 2.4s when using binary
18541852
// search.

regex-automata/src/nfa/thompson/range_trie.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -594,7 +594,7 @@ impl State {
594594
// Benchmarks suggest that binary search is just a bit faster than
595595
// straight linear search. Specifically when using the debug tool:
596596
//
597-
// hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'"
597+
// hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'"
598598
binary_search(&self.transitions, |t| range.start <= t.range.end)
599599
}
600600

regex-automata/src/util/look.rs

+3-1
Original file line numberDiff line numberDiff line change
@@ -1024,7 +1024,9 @@ impl core::fmt::Display for UnicodeWordBoundaryError {
10241024
// There are perhaps other choices as well. Why did I stop at these 4? Because
10251025
// I wanted to preserve my sanity. I suspect I'll wind up adding the lazy DFA
10261026
// approach eventually, as the benefits of the DFA approach are somewhat
1027-
// compelling. The 'boundary-words-holmes' benchmark tests this:
1027+
// compelling. The 'boundary-words-holmes' benchmark tests this. (Note that
1028+
// the commands below no longer work. If necessary, we should recreate
1029+
// the benchmark from whole cloth in rebar.)
10281030
//
10291031
// $ regex-cli bench measure -f boundary-words-holmes -e pikevm > dfa.csv
10301032
//

0 commit comments

Comments
 (0)