Skip to content

Commit baf5b1e

Browse files
committed
syntax and automata: bump LookSet representation from u16 to u32
This is in preparation for adding 8 new word boundary look-around assertions: \b{start}, \b{end}, \b{start-half} and \b{end-half}, along with Unicode and ASCII-only variants of each. Ref #469
1 parent 6d2b09e commit baf5b1e

File tree

6 files changed

+55
-50
lines changed

6 files changed

+55
-50
lines changed

regex-automata/src/dfa/dense.rs

+4-4
Original file line numberDiff line numberDiff line change
@@ -882,20 +882,20 @@ impl Config {
882882
/// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039
883883
/// use regex_automata::{dfa::{dense, Automaton}, Input};
884884
///
885-
/// // 600KB isn't enough!
885+
/// // 700KB isn't enough!
886886
/// dense::Builder::new()
887887
/// .configure(dense::Config::new()
888-
/// .determinize_size_limit(Some(600_000))
888+
/// .determinize_size_limit(Some(700_000))
889889
/// )
890890
/// .build(r"\w{20}")
891891
/// .unwrap_err();
892892
///
893-
/// // ... but 700KB probably is!
893+
/// // ... but 800KB probably is!
894894
/// // (Note that auxiliary storage sizes aren't necessarily stable between
895895
/// // releases.)
896896
/// let dfa = dense::Builder::new()
897897
/// .configure(dense::Config::new()
898-
/// .determinize_size_limit(Some(700_000))
898+
/// .determinize_size_limit(Some(800_000))
899899
/// )
900900
/// .build(r"\w{20}")?;
901901
/// let haystack = "A".repeat(20).into_bytes();

regex-automata/src/dfa/onepass.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -2815,7 +2815,7 @@ impl Epsilons {
28152815

28162816
/// Return the set of look-around assertions in these epsilon transitions.
28172817
fn looks(self) -> LookSet {
2818-
LookSet { bits: (self.0 & Epsilons::LOOK_MASK).low_u16() }
2818+
LookSet { bits: (self.0 & Epsilons::LOOK_MASK).low_u32() }
28192819
}
28202820

28212821
/// Set the look-around assertions on these epsilon transitions.

regex-automata/src/util/determinize/state.rs

+20-19
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ impl StateBuilderEmpty {
197197
}
198198

199199
pub(crate) fn into_matches(mut self) -> StateBuilderMatches {
200-
self.0.extend_from_slice(&[0, 0, 0, 0, 0]);
200+
self.0.extend_from_slice(&[0, 0, 0, 0, 0, 0, 0, 0, 0]);
201201
StateBuilderMatches(self.0)
202202
}
203203

@@ -348,16 +348,17 @@ impl StateBuilderNFA {
348348
/// generated by a transition over a "word" byte. (Callers may not always set
349349
/// this. For example, if the NFA has no word boundary assertion, then needing
350350
/// to track whether a state came from a word byte or not is superfluous and
351-
/// wasteful.)
351+
/// wasteful.) Bit 3 is set to 1 if the state was generated by a transition
352+
/// from a `\r` (forward search) or a `\n` (reverse search) when CRLF mode is
353+
/// enabled.
352354
///
353-
/// Byte 1 corresponds to the look-behind assertions that were satisfied by
354-
/// the transition that created this state. This generally only includes the
355-
/// StartLF and Start assertions. (Look-ahead assertions are not tracked as
356-
/// part of states. Instead, these are applied by re-computing the epsilon
357-
/// closure of a state when computing the transition function. See `next` in
358-
/// the parent module.)
355+
/// Bytes 1..5 correspond to the look-behind assertions that were satisfied
356+
/// by the transition that created this state. (Look-ahead assertions are not
357+
/// tracked as part of states. Instead, these are applied by re-computing the
358+
/// epsilon closure of a state when computing the transition function. See
359+
/// `next` in the parent module.)
359360
///
360-
/// Byte 2 corresponds to the set of look-around assertions (including both
361+
/// Bytes 5..9 correspond to the set of look-around assertions (including both
361362
/// look-behind and look-ahead) that appear somewhere in this state's set of
362363
/// NFA state IDs. This is used to determine whether this state's epsilon
363364
/// closure should be re-computed when computing the transition function.
@@ -366,7 +367,7 @@ impl StateBuilderNFA {
366367
/// function, we should only re-compute the epsilon closure if those new
367368
/// assertions are relevant to this particular state.
368369
///
369-
/// Bytes 3..7 correspond to a 32-bit native-endian encoded integer
370+
/// Bytes 9..13 correspond to a 32-bit native-endian encoded integer
370371
/// corresponding to the number of patterns encoded in this state. If the state
371372
/// is not a match state (byte 0 bit 0 is 0) or if its only pattern ID is
372373
/// PatternID::ZERO, then no integer is encoded at this position. Instead, byte
@@ -452,7 +453,7 @@ impl<'a> Repr<'a> {
452453
/// state has no conditional epsilon transitions, then there is no need
453454
/// to re-compute the epsilon closure.
454455
fn look_need(&self) -> LookSet {
455-
LookSet::read_repr(&self.0[3..])
456+
LookSet::read_repr(&self.0[5..])
456457
}
457458

458459
/// Returns the total number of match pattern IDs in this state.
@@ -476,7 +477,7 @@ impl<'a> Repr<'a> {
476477
if !self.has_pattern_ids() {
477478
PatternID::ZERO
478479
} else {
479-
let offset = 9 + index * PatternID::SIZE;
480+
let offset = 13 + index * PatternID::SIZE;
480481
// This is OK since we only ever serialize valid PatternIDs to
481482
// states.
482483
wire::read_pattern_id_unchecked(&self.0[offset..]).0
@@ -507,7 +508,7 @@ impl<'a> Repr<'a> {
507508
f(PatternID::ZERO);
508509
return;
509510
}
510-
let mut pids = &self.0[9..self.pattern_offset_end()];
511+
let mut pids = &self.0[13..self.pattern_offset_end()];
511512
while !pids.is_empty() {
512513
let pid = wire::read_u32(pids);
513514
pids = &pids[PatternID::SIZE..];
@@ -539,11 +540,11 @@ impl<'a> Repr<'a> {
539540
fn pattern_offset_end(&self) -> usize {
540541
let encoded = self.encoded_pattern_len();
541542
if encoded == 0 {
542-
return 5;
543+
return 9;
543544
}
544545
// This arithmetic is OK since we were able to address this many bytes
545546
// when writing to the state, thus, it must fit into a usize.
546-
encoded.checked_mul(4).unwrap().checked_add(9).unwrap()
547+
encoded.checked_mul(4).unwrap().checked_add(13).unwrap()
547548
}
548549

549550
/// Returns the total number of *encoded* pattern IDs in this state.
@@ -557,7 +558,7 @@ impl<'a> Repr<'a> {
557558
}
558559
// This unwrap is OK since the total number of patterns is always
559560
// guaranteed to fit into a usize.
560-
usize::try_from(wire::read_u32(&self.0[5..9])).unwrap()
561+
usize::try_from(wire::read_u32(&self.0[9..13])).unwrap()
561562
}
562563
}
563564

@@ -643,7 +644,7 @@ impl<'a> ReprVec<'a> {
643644
/// Mutate the set of look-around (both behind and ahead) assertions that
644645
/// appear at least once in this state's set of NFA states.
645646
fn set_look_need(&mut self, mut set: impl FnMut(LookSet) -> LookSet) {
646-
set(self.look_need()).write_repr(&mut self.0[3..]);
647+
set(self.look_need()).write_repr(&mut self.0[5..]);
647648
}
648649

649650
/// Add a pattern ID to this state. All match states must have at least
@@ -703,14 +704,14 @@ impl<'a> ReprVec<'a> {
703704
return;
704705
}
705706
let patsize = PatternID::SIZE;
706-
let pattern_bytes = self.0.len() - 9;
707+
let pattern_bytes = self.0.len() - 13;
707708
// Every pattern ID uses 4 bytes, so number of bytes should be
708709
// divisible by 4.
709710
assert_eq!(pattern_bytes % patsize, 0);
710711
// This unwrap is OK since we are guaranteed that the maximum number
711712
// of possible patterns fits into a u32.
712713
let count32 = u32::try_from(pattern_bytes / patsize).unwrap();
713-
wire::NE::write_u32(count32, &mut self.0[5..9]);
714+
wire::NE::write_u32(count32, &mut self.0[9..13]);
714715
}
715716

716717
/// Add an NFA state ID to this state. The order in which NFA states are

regex-automata/src/util/look.rs

+14-12
Original file line numberDiff line numberDiff line change
@@ -125,17 +125,17 @@ impl Look {
125125
/// constructor is guaranteed to return the same look-around variant that
126126
/// one started with within a semver compatible release of this crate.
127127
#[inline]
128-
pub const fn as_repr(self) -> u16 {
128+
pub const fn as_repr(self) -> u32 {
129129
// AFAIK, 'as' is the only way to zero-cost convert an int enum to an
130130
// actual int.
131-
self as u16
131+
self as u32
132132
}
133133

134134
/// Given the underlying representation of a `Look` value, return the
135135
/// corresponding `Look` value if the representation is valid. Otherwise
136136
/// `None` is returned.
137137
#[inline]
138-
pub const fn from_repr(repr: u16) -> Option<Look> {
138+
pub const fn from_repr(repr: u32) -> Option<Look> {
139139
match repr {
140140
0b00_0000_0001 => Some(Look::Start),
141141
0b00_0000_0010 => Some(Look::End),
@@ -191,7 +191,7 @@ pub struct LookSet {
191191
/// range of `u32` values to be represented. For example, even if the
192192
/// current implementation only makes use of the 10 least significant bits,
193193
/// it may use more bits in a future semver compatible release.
194-
pub bits: u16,
194+
pub bits: u32,
195195
}
196196

197197
impl LookSet {
@@ -379,29 +379,31 @@ impl LookSet {
379379
*self = self.intersect(other);
380380
}
381381

382-
/// Return a `LookSet` from the slice given as a native endian 16-bit
382+
/// Return a `LookSet` from the slice given as a native endian 32-bit
383383
/// integer.
384384
///
385385
/// # Panics
386386
///
387-
/// This panics if `slice.len() < 2`.
387+
/// This panics if `slice.len() < 4`.
388388
#[inline]
389389
pub fn read_repr(slice: &[u8]) -> LookSet {
390-
let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap());
390+
let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap());
391391
LookSet { bits }
392392
}
393393

394-
/// Write a `LookSet` as a native endian 16-bit integer to the beginning
394+
/// Write a `LookSet` as a native endian 32-bit integer to the beginning
395395
/// of the slice given.
396396
///
397397
/// # Panics
398398
///
399-
/// This panics if `slice.len() < 2`.
399+
/// This panics if `slice.len() < 4`.
400400
#[inline]
401401
pub fn write_repr(self, slice: &mut [u8]) {
402402
let raw = self.bits.to_ne_bytes();
403403
slice[0] = raw[0];
404404
slice[1] = raw[1];
405+
slice[2] = raw[2];
406+
slice[3] = raw[3];
405407
}
406408

407409
/// Checks that all assertions in this set can be matched.
@@ -456,9 +458,9 @@ impl Iterator for LookSetIter {
456458
return None;
457459
}
458460
// We'll never have more than u8::MAX distinct look-around assertions,
459-
// so 'repr' will always fit into a u16.
460-
let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
461-
let look = Look::from_repr(1 << repr)?;
461+
// so 'bit' will always fit into a u16.
462+
let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
463+
let look = Look::from_repr(1 << bit)?;
462464
self.set = self.set.remove(look);
463465
Some(look)
464466
}

regex-automata/tests/hybrid/api.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> {
5555
let mut cache = dfa.create_cache();
5656

5757
let haystack = "a".repeat(101).into_bytes();
58-
let err = MatchError::gave_up(25);
58+
let err = MatchError::gave_up(24);
5959
// Notice that we make the same amount of progress in each search! That's
6060
// because the cache is reused and already has states to handle the first
6161
// N bytes.
@@ -83,7 +83,7 @@ fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> {
8383
// OK, if we reset the cache, then we should be able to create more states
8484
// and make more progress with searching for betas.
8585
cache.reset(&dfa);
86-
let err = MatchError::gave_up(27);
86+
let err = MatchError::gave_up(26);
8787
assert_eq!(
8888
Err(err),
8989
dfa.try_search_fwd(&mut cache, &Input::new(&haystack))

regex-syntax/src/hir/mod.rs

+14-12
Original file line numberDiff line numberDiff line change
@@ -1664,17 +1664,17 @@ impl Look {
16641664
/// constructor is guaranteed to return the same look-around variant that
16651665
/// one started with within a semver compatible release of this crate.
16661666
#[inline]
1667-
pub const fn as_repr(self) -> u16 {
1667+
pub const fn as_repr(self) -> u32 {
16681668
// AFAIK, 'as' is the only way to zero-cost convert an int enum to an
16691669
// actual int.
1670-
self as u16
1670+
self as u32
16711671
}
16721672

16731673
/// Given the underlying representation of a `Look` value, return the
16741674
/// corresponding `Look` value if the representation is valid. Otherwise
16751675
/// `None` is returned.
16761676
#[inline]
1677-
pub const fn from_repr(repr: u16) -> Option<Look> {
1677+
pub const fn from_repr(repr: u32) -> Option<Look> {
16781678
match repr {
16791679
0b00_0000_0001 => Some(Look::Start),
16801680
0b00_0000_0010 => Some(Look::End),
@@ -2600,7 +2600,7 @@ pub struct LookSet {
26002600
/// range of `u32` values to be represented. For example, even if the
26012601
/// current implementation only makes use of the 10 least significant bits,
26022602
/// it may use more bits in a future semver compatible release.
2603-
pub bits: u16,
2603+
pub bits: u32,
26042604
}
26052605

26062606
impl LookSet {
@@ -2788,29 +2788,31 @@ impl LookSet {
27882788
*self = self.intersect(other);
27892789
}
27902790

2791-
/// Return a `LookSet` from the slice given as a native endian 16-bit
2791+
/// Return a `LookSet` from the slice given as a native endian 32-bit
27922792
/// integer.
27932793
///
27942794
/// # Panics
27952795
///
2796-
/// This panics if `slice.len() < 2`.
2796+
/// This panics if `slice.len() < 4`.
27972797
#[inline]
27982798
pub fn read_repr(slice: &[u8]) -> LookSet {
2799-
let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap());
2799+
let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap());
28002800
LookSet { bits }
28012801
}
28022802

2803-
/// Write a `LookSet` as a native endian 16-bit integer to the beginning
2803+
/// Write a `LookSet` as a native endian 32-bit integer to the beginning
28042804
/// of the slice given.
28052805
///
28062806
/// # Panics
28072807
///
2808-
/// This panics if `slice.len() < 2`.
2808+
/// This panics if `slice.len() < 4`.
28092809
#[inline]
28102810
pub fn write_repr(self, slice: &mut [u8]) {
28112811
let raw = self.bits.to_ne_bytes();
28122812
slice[0] = raw[0];
28132813
slice[1] = raw[1];
2814+
slice[2] = raw[2];
2815+
slice[3] = raw[3];
28142816
}
28152817
}
28162818

@@ -2843,9 +2845,9 @@ impl Iterator for LookSetIter {
28432845
return None;
28442846
}
28452847
// We'll never have more than u8::MAX distinct look-around assertions,
2846-
// so 'repr' will always fit into a u16.
2847-
let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
2848-
let look = Look::from_repr(1 << repr)?;
2848+
// so 'bit' will always fit into a u16.
2849+
let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
2850+
let look = Look::from_repr(1 << bit)?;
28492851
self.set = self.set.remove(look);
28502852
Some(look)
28512853
}

0 commit comments

Comments
 (0)