diff --git a/Cargo.lock b/Cargo.lock index 9add9e1c47f6d..878ccc5016598 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4455,7 +4455,6 @@ dependencies = [ "regex-syntax", "serde", "serde_json", - "serde_regex", "sha1", "sha2", "subtle", @@ -8289,16 +8288,6 @@ dependencies = [ "serde", ] -[[package]] -name = "serde_regex" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8136f1a4ea815d7eac4101cfd0b16dc0cb5e1fe1b8609dfd728058656b7badf" -dependencies = [ - "regex", - "serde", -] - [[package]] name = "serde_repr" version = "0.1.13" diff --git a/misc/cargo-vet/config.toml b/misc/cargo-vet/config.toml index ce1dbdeaa8a6f..572fba178415a 100644 --- a/misc/cargo-vet/config.toml +++ b/misc/cargo-vet/config.toml @@ -1314,10 +1314,6 @@ criteria = "safe-to-deploy" version = "1.0.1" criteria = "safe-to-deploy" -[[exemptions.serde_regex]] -version = "1.1.0" -criteria = "safe-to-deploy" - [[exemptions.serde_urlencoded]] version = "0.7.1" criteria = "safe-to-deploy" diff --git a/misc/python/materialize/output_consistency/input_data/operations/text_operations_provider.py b/misc/python/materialize/output_consistency/input_data/operations/text_operations_provider.py index 8ecc36f577934..77196a20a0214 100644 --- a/misc/python/materialize/output_consistency/input_data/operations/text_operations_provider.py +++ b/misc/python/materialize/output_consistency/input_data/operations/text_operations_provider.py @@ -114,8 +114,6 @@ "$ ILIKE $", [TextOperationParam(), LIKE_PARAM], BooleanReturnTypeSpec(), - # TODO: #26177 (ILIKE in data-flow rendering produces incorrect results when used with multiple replicas) - is_enabled=False, ) ) @@ -134,8 +132,6 @@ "$ NOT ILIKE $", [TextOperationParam(), LIKE_PARAM], BooleanReturnTypeSpec(), - # TODO: #26177 (ILIKE in data-flow rendering produces incorrect results when used with multiple replicas) - is_enabled=False, ) ) diff --git a/src/expr/Cargo.toml b/src/expr/Cargo.toml index db8269330b181..570c53c0992de 
100644 --- a/src/expr/Cargo.toml +++ b/src/expr/Cargo.toml @@ -50,7 +50,6 @@ regex = "1.7.0" regex-syntax = "0.6.28" serde = { version = "1.0.152", features = ["derive"] } serde_json = "1.0.89" -serde_regex = "1.1.0" sha1 = "0.10.5" sha2 = "0.10.6" subtle = "2.4.1" diff --git a/src/expr/src/scalar/like_pattern.rs b/src/expr/src/scalar/like_pattern.rs index c8e8852275122..be927d8d8573e 100644 --- a/src/expr/src/scalar/like_pattern.rs +++ b/src/expr/src/scalar/like_pattern.rs @@ -14,8 +14,8 @@ use derivative::Derivative; use mz_lowertest::MzReflect; use mz_ore::fmt::FormatBuffer; use mz_proto::{ProtoType, RustType, TryFromProtoError}; +use mz_repr::adt::regex::Regex; use proptest::prelude::{Arbitrary, Strategy}; -use regex::{Regex, RegexBuilder}; use serde::{Deserialize, Serialize}; use crate::scalar::EvalError; @@ -151,7 +151,7 @@ mod matcher { #[derive(Debug, Clone, Deserialize, Serialize, MzReflect)] pub(super) enum MatcherImpl { String(Vec), - Regex(#[serde(with = "serde_regex")] Regex), + Regex(Regex), } } @@ -403,10 +403,7 @@ fn build_regex(subpatterns: &[Subpattern], case_insensitive: bool) -> Result Ok(regex), Err(regex::Error::CompiledTooBig(_)) => Err(EvalError::LikePatternTooLong), Err(e) => Err(EvalError::Internal(format!( diff --git a/src/repr/src/adt/regex.proto b/src/repr/src/adt/regex.proto index 53f3dd0a591b3..4bd15e4c363d1 100644 --- a/src/repr/src/adt/regex.proto +++ b/src/repr/src/adt/regex.proto @@ -14,4 +14,5 @@ package mz_repr.adt.regex; message ProtoRegex { string pattern = 1; bool case_insensitive = 2; + bool dot_matches_new_line = 3; } diff --git a/src/repr/src/adt/regex.rs b/src/repr/src/adt/regex.rs index 36907c9433f66..0a161989aff46 100644 --- a/src/repr/src/adt/regex.rs +++ b/src/repr/src/adt/regex.rs @@ -36,9 +36,10 @@ include!(concat!(env!("OUT_DIR"), "/mz_repr.adt.regex.rs")); /// /// This type wraps [`regex::Regex`] and imbues it with implementations of the /// above traits. 
Two regexes are considered equal iff their string -/// representation is identical. The [`PartialOrd`], [`Ord`], and [`Hash`] -/// implementations are similarly based upon the string representation. As -/// mentioned above, is not the natural equivalence relation for regexes: for +/// representation is identical, plus flags, such as `case_insensitive`, +/// are identical. The [`PartialOrd`], [`Ord`], and [`Hash`] implementations +/// are similarly based upon the string representation plus flags. As +/// mentioned above, this is not the natural equivalence relation for regexes: for /// example, the regexes `aa*` and `a+` define the same language, but would not /// compare as equal with this implementation of [`PartialEq`]. Still, it is /// often useful to have _some_ equivalence relation available (e.g., to store @@ -47,23 +48,36 @@ include!(concat!(env!("OUT_DIR"), "/mz_repr.adt.regex.rs")); /// /// [regex::Regex] is hard to serialize (because of the compiled code), so our approach is to /// instead serialize this wrapper struct, where we skip serializing the actual regex field, and -/// we reconstruct the regex field from the other two fields upon deserialization. +/// we reconstruct the regex field from the other fields upon deserialization. /// (Earlier, serialization was buggy due to , /// and also making the same mistake in our own protobuf serialization code.) #[derive(Debug, Clone, MzReflect)] pub struct Regex { pub pattern: String, pub case_insensitive: bool, + pub dot_matches_new_line: bool, pub regex: regex::Regex, } impl Regex { + /// A simple constructor for the default setting of `dot_matches_new_line: false`. pub fn new(pattern: String, case_insensitive: bool) -> Result { + Self::new_dot_matches_new_line(pattern, case_insensitive, false) + } + + /// Allows explicitly setting `dot_matches_new_line`. 
+ pub fn new_dot_matches_new_line( + pattern: String, + case_insensitive: bool, + dot_matches_new_line: bool, + ) -> Result { let mut regex_builder = RegexBuilder::new(pattern.as_str()); regex_builder.case_insensitive(case_insensitive); + regex_builder.dot_matches_new_line(dot_matches_new_line); Ok(Regex { pattern, case_insensitive, + dot_matches_new_line, regex: regex_builder.build()?, }) } @@ -71,7 +85,9 @@ impl Regex { impl PartialEq for Regex { fn eq(&self, other: &Regex) -> bool { - self.pattern == other.pattern && self.case_insensitive == other.case_insensitive + self.pattern == other.pattern + && self.case_insensitive == other.case_insensitive + && self.dot_matches_new_line == other.dot_matches_new_line } } @@ -85,7 +101,16 @@ impl PartialOrd for Regex { impl Ord for Regex { fn cmp(&self, other: &Regex) -> Ordering { - (&self.pattern, self.case_insensitive).cmp(&(&other.pattern, other.case_insensitive)) + ( + &self.pattern, + self.case_insensitive, + self.dot_matches_new_line, + ) + .cmp(&( + &other.pattern, + other.case_insensitive, + other.dot_matches_new_line, + )) } } @@ -93,6 +118,7 @@ impl Hash for Regex { fn hash(&self, hasher: &mut H) { self.pattern.hash(hasher); self.case_insensitive.hash(hasher); + self.dot_matches_new_line.hash(hasher); } } @@ -109,11 +135,18 @@ impl RustType for Regex { ProtoRegex { pattern: self.pattern.clone(), case_insensitive: self.case_insensitive, + dot_matches_new_line: self.dot_matches_new_line, } } fn from_proto(proto: ProtoRegex) -> Result { - Ok(Regex::new(proto.pattern, proto.case_insensitive)?) + Ok(Regex::new_dot_matches_new_line( + proto.pattern, + proto.case_insensitive, + // We used to not have the following field. If we happen to deserialize an old object, + // this will default to false, which is exactly what we want. + proto.dot_matches_new_line, + )?) 
} } @@ -122,9 +155,10 @@ impl Serialize for Regex { where S: Serializer, { - let mut state = serializer.serialize_struct("Regex", 2)?; + let mut state = serializer.serialize_struct("Regex", 3)?; state.serialize_field("pattern", &self.pattern)?; state.serialize_field("case_insensitive", &self.case_insensitive)?; + state.serialize_field("dot_matches_new_line", &self.dot_matches_new_line)?; state.end() } } @@ -137,6 +171,7 @@ impl<'de> Deserialize<'de> for Regex { enum Field { Pattern, CaseInsensitive, + DotMatchesNewLine, } impl<'de> Deserialize<'de> for Field { @@ -150,7 +185,9 @@ impl<'de> Deserialize<'de> for Regex { type Value = Field; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { - formatter.write_str("pattern string or case_insensitive bool") + formatter.write_str( + "pattern string or case_insensitive bool or dot_matches_new_line bool", + ) } fn visit_str(self, value: &str) -> Result @@ -160,6 +197,7 @@ impl<'de> Deserialize<'de> for Regex { match value { "pattern" => Ok(Field::Pattern), "case_insensitive" => Ok(Field::CaseInsensitive), + "dot_matches_new_line" => Ok(Field::DotMatchesNewLine), _ => Err(de::Error::unknown_field(value, FIELDS)), } } @@ -188,12 +226,16 @@ impl<'de> Deserialize<'de> for Regex { let case_insensitive = seq .next_element()? .ok_or_else(|| de::Error::invalid_length(1, &self))?; - Regex::new(pattern, case_insensitive).map_err(|err| { - V::Error::custom(format!( - "Unable to recreate regex during deserialization: {}", - err - )) - }) + let dot_matches_new_line = seq + .next_element()? 
+ .ok_or_else(|| de::Error::invalid_length(2, &self))?; + Regex::new_dot_matches_new_line(pattern, case_insensitive, dot_matches_new_line) + .map_err(|err| { + V::Error::custom(format!( + "Unable to recreate regex during deserialization: {}", + err + )) + }) } fn visit_map(self, mut map: V) -> Result @@ -202,6 +244,7 @@ impl<'de> Deserialize<'de> for Regex { { let mut pattern: Option = None; let mut case_insensitive: Option = None; + let mut dot_matches_new_line: Option = None; while let Some(key) = map.next_key()? { match key { Field::Pattern => { @@ -216,21 +259,30 @@ impl<'de> Deserialize<'de> for Regex { } case_insensitive = Some(map.next_value()?); } + Field::DotMatchesNewLine => { + if dot_matches_new_line.is_some() { + return Err(de::Error::duplicate_field("dot_matches_new_line")); + } + dot_matches_new_line = Some(map.next_value()?); + } } } let pattern = pattern.ok_or_else(|| de::Error::missing_field("pattern"))?; let case_insensitive = case_insensitive.ok_or_else(|| de::Error::missing_field("case_insensitive"))?; - Regex::new(pattern, case_insensitive).map_err(|err| { - V::Error::custom(format!( - "Unable to recreate regex during deserialization: {}", - err - )) - }) + let dot_matches_new_line = dot_matches_new_line + .ok_or_else(|| de::Error::missing_field("dot_matches_new_line"))?; + Regex::new_dot_matches_new_line(pattern, case_insensitive, dot_matches_new_line) + .map_err(|err| { + V::Error::custom(format!( + "Unable to recreate regex during deserialization: {}", + err + )) + }) } } - const FIELDS: &[&str] = &["pattern", "case_insensitive"]; + const FIELDS: &[&str] = &["pattern", "case_insensitive", "dot_matches_new_line"]; deserializer.deserialize_struct("Regex", FIELDS, RegexVisitor) } } @@ -247,10 +299,10 @@ const END_SYMBOLS: &str = r"(\$|(\\z))?"; prop_compose! 
{ pub fn any_regex() (b in BEGINNING_SYMBOLS, c in CHARACTERS, - r in REPETITIONS, e in END_SYMBOLS, case_insensitive in any::()) + r in REPETITIONS, e in END_SYMBOLS, case_insensitive in any::(), dot_matches_new_line in any::()) -> Regex { let string = format!("{}{}{}{}", b, c, r, e); - Regex::new(string, case_insensitive).unwrap() + Regex::new_dot_matches_new_line(string, case_insensitive, dot_matches_new_line).unwrap() } } @@ -271,9 +323,9 @@ mod tests { } } - // This was failing before due to the derived serde serialization being incorrect, because of - // . - // Nowadays, we use our own handwritten Serialize/Deserialize impls for our Regex wrapper struct. + /// This was failing before due to the derived serde serialization being incorrect, because of + /// . + /// Nowadays, we use our own handwritten Serialize/Deserialize impls for our Regex wrapper struct. #[mz_ore::test] fn regex_serde_case_insensitive() { let orig_regex = Regex::new("AAA".to_string(), true).unwrap(); @@ -285,4 +337,36 @@ mod tests { assert_eq!(orig_regex.regex.is_match("aaa"), true); assert_eq!(roundtrip_result.regex.is_match("aaa"), true); } + + /// Test the roundtripping of `dot_matches_new_line`. + /// (Similar to the above `regex_serde_case_insensitive`.) 
+ #[mz_ore::test] + fn regex_serde_dot_matches_new_line() { + { + // dot_matches_new_line: true + let orig_regex = + Regex::new_dot_matches_new_line("A.*B".to_string(), true, true).unwrap(); + let serialized: String = serde_json::to_string(&orig_regex).unwrap(); + let roundtrip_result: Regex = serde_json::from_str(&serialized).unwrap(); + assert_eq!(orig_regex.regex.is_match("axxx\nxxxb"), true); + assert_eq!(roundtrip_result.regex.is_match("axxx\nxxxb"), true); + } + { + // dot_matches_new_line: false + let orig_regex = + Regex::new_dot_matches_new_line("A.*B".to_string(), true, false).unwrap(); + let serialized: String = serde_json::to_string(&orig_regex).unwrap(); + let roundtrip_result: Regex = serde_json::from_str(&serialized).unwrap(); + assert_eq!(orig_regex.regex.is_match("axxx\nxxxb"), false); + assert_eq!(roundtrip_result.regex.is_match("axxx\nxxxb"), false); + } + { + // dot_matches_new_line: default + let orig_regex = Regex::new("A.*B".to_string(), true).unwrap(); + let serialized: String = serde_json::to_string(&orig_regex).unwrap(); + let roundtrip_result: Regex = serde_json::from_str(&serialized).unwrap(); + assert_eq!(orig_regex.regex.is_match("axxx\nxxxb"), false); + assert_eq!(roundtrip_result.regex.is_match("axxx\nxxxb"), false); + } + } } diff --git a/test/sqllogictest/explain/optimized_plan_as_json.slt b/test/sqllogictest/explain/optimized_plan_as_json.slt index bb77dad07988b..fd51dc6f603af 100644 --- a/test/sqllogictest/explain/optimized_plan_as_json.slt +++ b/test/sqllogictest/explain/optimized_plan_as_json.slt @@ -5747,7 +5747,8 @@ SELECT s FROM t2 WHERE s ~ 'a.*'; "func": { "IsRegexpMatch": { "pattern": "a.*", - "case_insensitive": false + "case_insensitive": false, + "dot_matches_new_line": false } }, "expr": { @@ -5780,7 +5781,8 @@ SELECT s FROM t2 WHERE s ~ 'a.*'; "func": { "IsRegexpMatch": { "pattern": "a.*", - "case_insensitive": false + "case_insensitive": false, + "dot_matches_new_line": false } }, "expr": { diff --git 
a/test/sqllogictest/like.slt b/test/sqllogictest/like.slt index 26e3b287d5e96..9a3e97d5f38bb 100644 --- a/test/sqllogictest/like.slt +++ b/test/sqllogictest/like.slt @@ -130,3 +130,67 @@ SELECT s FROM t WHERE s ILIKE like_pat; ---- ABC abc + +# Regression test for https://github.com/MaterializeInc/materialize/issues/26177 + +statement ok +CREATE TABLE t2 (text_val TEXT); + +statement ok +INSERT INTO t2 VALUES ('abc'); + +statement ok +CREATE CLUSTER test SIZE '2-1'; + +statement ok +SET cluster = test; + +query B +SELECT text_val NOT ILIKE '%A%' FROM t2; +---- +false + +query B +SELECT text_val NOT ILIKE '%A%' FROM t2; +---- +false + +query B +SELECT text_val NOT ILIKE '%A%' FROM t2; +---- +false + +query B +SELECT text_val NOT ILIKE '%A%' FROM t2; +---- +false + +query B +SELECT text_val NOT ILIKE '%A%' FROM t2; +---- +false + +query B +SELECT text_val NOT ILIKE '%A%' FROM t2; +---- +false + +query B +SELECT text_val NOT ILIKE '%A%' FROM t2; +---- +false + +# Test that % matches \n +query B +SELECT E'test line 1\ntest line 2' LIKE '%1%2%'; +---- +true + +statement ok +INSERT INTO t2 VALUES (E'test line 1\ntest line 2'); + +query B +SELECT text_val LIKE '%1%2%' FROM t2; +---- +false +true