Skip to content

Commit 3feff9e

Browse files
committed
automata: improve sparse DFA validation
This rejiggers some code so that we can more reliably check whether start state IDs are valid or not. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=62726
1 parent 914198f commit 3feff9e

File tree

2 files changed

+83
-76
lines changed

2 files changed

+83
-76
lines changed

regex-automata/src/dfa/sparse.rs

+83-76
Original file line numberDiff line numberDiff line change
@@ -992,8 +992,8 @@ impl<'a> DFA<&'a [u8]> {
992992
// (by trying to decode every state) and start state ID list below. If
993993
// either validation fails, then we return an error.
994994
let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? };
995-
dfa.tt.validate(&dfa.special)?;
996-
dfa.st.validate(&dfa.special, &dfa.tt)?;
995+
let seen = dfa.tt.validate(&dfa.special)?;
996+
dfa.st.validate(&dfa.special, &seen)?;
997997
// N.B. dfa.special doesn't have a way to do unchecked deserialization,
998998
// so it has already been validated.
999999
Ok((dfa, nread))
@@ -1388,63 +1388,8 @@ impl<T: AsRef<[u8]>> Transitions<T> {
13881388
///
13891389
/// That is, every state ID can be used to correctly index a state in this
13901390
/// table.
1391-
fn validate(&self, sp: &Special) -> Result<(), DeserializeError> {
1392-
// In order to validate everything, we not only need to make sure we
1393-
// can decode every state, but that every transition in every state
1394-
// points to a valid state. There are many duplicative transitions, so
1395-
// we record state IDs that we've verified so that we don't redo the
1396-
// decoding work.
1397-
//
1398-
// Except, when in no_std mode, we don't have dynamic memory allocation
1399-
// available to us, so we skip this optimization. It's not clear
1400-
// whether doing something more clever is worth it just yet. If you're
1401-
// profiling this code and need it to run faster, please file an issue.
1402-
//
1403-
// OK, so we also use this to record the set of valid state IDs. Since
1404-
// it is possible for a transition to point to an invalid state ID that
1405-
// still (somehow) deserializes to a valid state. So we need to make
1406-
// sure our transitions are limited to actually correct state IDs.
1407-
// The problem is, I'm not sure how to do this verification step in
1408-
// no-std no-alloc mode. I think we'd *have* to store the set of valid
1409-
// state IDs in the DFA itself. For now, we don't do this verification
1410-
// in no-std no-alloc mode. The worst thing that can happen is an
1411-
// incorrect result. But no panics or memory safety problems should
1412-
// result. Because we still do validate that the state itself is
1413-
// "valid" in the sense that everything it points to actually exists.
1414-
//
1415-
// ---AG
1416-
struct Seen {
1417-
#[cfg(feature = "alloc")]
1418-
set: alloc::collections::BTreeSet<StateID>,
1419-
#[cfg(not(feature = "alloc"))]
1420-
set: core::marker::PhantomData<StateID>,
1421-
}
1422-
1423-
#[cfg(feature = "alloc")]
1424-
impl Seen {
1425-
fn new() -> Seen {
1426-
Seen { set: alloc::collections::BTreeSet::new() }
1427-
}
1428-
fn insert(&mut self, id: StateID) {
1429-
self.set.insert(id);
1430-
}
1431-
fn contains(&self, id: &StateID) -> bool {
1432-
self.set.contains(id)
1433-
}
1434-
}
1435-
1436-
#[cfg(not(feature = "alloc"))]
1437-
impl Seen {
1438-
fn new() -> Seen {
1439-
Seen { set: core::marker::PhantomData }
1440-
}
1441-
fn insert(&mut self, _id: StateID) {}
1442-
fn contains(&self, _id: &StateID) -> bool {
1443-
false
1444-
}
1445-
}
1446-
1447-
let mut verified: Seen = Seen::new();
1391+
fn validate(&self, sp: &Special) -> Result<Seen, DeserializeError> {
1392+
let mut verified = Seen::new();
14481393
// We need to make sure that we decode the correct number of states.
14491394
// Otherwise, an empty set of transitions would validate even if the
14501395
// recorded state length is non-empty.
@@ -1521,7 +1466,7 @@ impl<T: AsRef<[u8]>> Transitions<T> {
15211466
"mismatching sparse state length",
15221467
));
15231468
}
1524-
Ok(())
1469+
Ok(verified)
15251470
}
15261471

15271472
/// Converts these transitions to a borrowed value.
@@ -1659,7 +1604,7 @@ impl<T: AsRef<[u8]>> Transitions<T> {
16591604
let state = &state[nr..];
16601605
if npats == 0 {
16611606
return Err(DeserializeError::generic(
1662-
"state marked as a match, but has no pattern IDs",
1607+
"state marked as a match, but pattern length is zero",
16631608
));
16641609
}
16651610

@@ -1681,6 +1626,21 @@ impl<T: AsRef<[u8]>> Transitions<T> {
16811626
} else {
16821627
(&[][..], state)
16831628
};
1629+
if is_match && pattern_ids.is_empty() {
1630+
return Err(DeserializeError::generic(
1631+
"state marked as a match, but has no pattern IDs",
1632+
));
1633+
}
1634+
if sp.is_match_state(id) && pattern_ids.is_empty() {
1635+
return Err(DeserializeError::generic(
1636+
"state marked special as a match, but has no pattern IDs",
1637+
));
1638+
}
1639+
if sp.is_match_state(id) != is_match {
1640+
return Err(DeserializeError::generic(
1641+
"whether state is a match or not is inconsistent",
1642+
));
1643+
}
16841644

16851645
// Now read this state's accelerator info. The first byte is the length
16861646
// of the accelerator, which is typically 0 (for no acceleration) but
@@ -2061,28 +2021,19 @@ impl<T: AsRef<[u8]>> StartTable<T> {
20612021
fn validate(
20622022
&self,
20632023
sp: &Special,
2064-
trans: &Transitions<T>,
2024+
seen: &Seen,
20652025
) -> Result<(), DeserializeError> {
20662026
for (id, _, _) in self.iter() {
2027+
if !seen.contains(&id) {
2028+
return Err(DeserializeError::generic(
2029+
"found invalid start state ID",
2030+
));
2031+
}
20672032
if sp.is_match_state(id) {
20682033
return Err(DeserializeError::generic(
20692034
"start states cannot be match states",
20702035
));
20712036
}
2072-
// Confirm that the start state points to a valid state.
2073-
let state = trans.try_state(sp, id)?;
2074-
// And like for the transition table, confirm that the transitions
2075-
// on all start states themselves point to a valid state.
2076-
//
2077-
// It'd probably be better to integrate this validation with the
2078-
// transition table, or otherwise store a sorted sequence of all
2079-
// valid state IDs in the sparse DFA itself. That way, we could
2080-
// check that every pointer to a state corresponds precisely to a
2081-
// correct and valid state.
2082-
for i in 0..state.ntrans {
2083-
let to = state.next_at(i);
2084-
let _ = trans.try_state(sp, to)?;
2085-
}
20862037
}
20872038
Ok(())
20882039
}
@@ -2537,6 +2488,62 @@ impl<'a> fmt::Debug for StateMut<'a> {
25372488
}
25382489
}
25392490

2491+
// In order to validate everything, we not only need to make sure we
2492+
// can decode every state, but that every transition in every state
2493+
// points to a valid state. There are many duplicative transitions, so
2494+
// we record state IDs that we've verified so that we don't redo the
2495+
// decoding work.
2496+
//
2497+
// Except, when in no_std mode, we don't have dynamic memory allocation
2498+
// available to us, so we skip this optimization. It's not clear
2499+
// whether doing something more clever is worth it just yet. If you're
2500+
// profiling this code and need it to run faster, please file an issue.
2501+
//
2502+
// OK, so we also use this to record the set of valid state IDs. Since
2503+
// it is possible for a transition to point to an invalid state ID that
2504+
// still (somehow) deserializes to a valid state. So we need to make
2505+
// sure our transitions are limited to actually correct state IDs.
2506+
// The problem is, I'm not sure how to do this verification step in
2507+
// no-std no-alloc mode. I think we'd *have* to store the set of valid
2508+
// state IDs in the DFA itself. For now, we don't do this verification
2509+
// in no-std no-alloc mode. The worst thing that can happen is an
2510+
// incorrect result. But no panics or memory safety problems should
2511+
// result. Because we still do validate that the state itself is
2512+
// "valid" in the sense that everything it points to actually exists.
2513+
//
2514+
// ---AG
2515+
#[derive(Debug)]
2516+
struct Seen {
2517+
#[cfg(feature = "alloc")]
2518+
set: alloc::collections::BTreeSet<StateID>,
2519+
#[cfg(not(feature = "alloc"))]
2520+
set: core::marker::PhantomData<StateID>,
2521+
}
2522+
2523+
#[cfg(feature = "alloc")]
2524+
impl Seen {
2525+
fn new() -> Seen {
2526+
Seen { set: alloc::collections::BTreeSet::new() }
2527+
}
2528+
fn insert(&mut self, id: StateID) {
2529+
self.set.insert(id);
2530+
}
2531+
fn contains(&self, id: &StateID) -> bool {
2532+
self.set.contains(id)
2533+
}
2534+
}
2535+
2536+
#[cfg(not(feature = "alloc"))]
2537+
impl Seen {
2538+
fn new() -> Seen {
2539+
Seen { set: core::marker::PhantomData }
2540+
}
2541+
fn insert(&mut self, _id: StateID) {}
2542+
fn contains(&self, _id: &StateID) -> bool {
2543+
false
2544+
}
2545+
}
2546+
25402547
/*
25412548
/// A binary search routine specialized specifically to a sparse DFA state's
25422549
/// transitions. Specifically, the transitions are defined as a set of pairs

0 commit comments

Comments
 (0)