Skip to content

Commit edcc1bf

Browse files
authored
Merge pull request #8011 from Scoopit/right-way-to-shorten-a-string
fix: Use unicode_segmentation to truncate INSERT statement
2 parents b8f1a29 + 0c57f56 commit edcc1bf

File tree

3 files changed

+46
-1
lines changed

3 files changed

+46
-1
lines changed

Cargo.lock

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/query/legacy-parser/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,4 @@ common-legacy-expression = { path = "../legacy-expression" }
1616

1717
async-trait = "0.1.57"
1818
sqlparser = { git = "https://github.com/datafuse-extras/sqlparser-rs", rev = "7f246e3" }
19+
unicode-segmentation = "^1.2"

src/query/legacy-parser/src/sql_common.rs

+44-1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ use common_datavalues::prelude::*;
1818
use common_exception::ErrorCode;
1919
use common_exception::Result;
2020
use sqlparser::ast::DataType as SQLDataType;
21+
use unicode_segmentation::UnicodeSegmentation;
2122

2223
pub struct SQLCommon;
2324

@@ -126,9 +127,51 @@ impl SQLCommon {
126127
pub fn short_sql(query: &str) -> String {
127128
let query = query.trim_start();
128129
if query.len() >= 64 && query[..6].eq_ignore_ascii_case("INSERT") {
129-
format!("{}...", &query[..64])
130+
// keep first 64 graphemes
131+
String::from_utf8(
132+
query
133+
.graphemes(true)
134+
.take(64)
135+
.flat_map(|g| g.as_bytes().iter())
136+
.copied() // copied converts &u8 into u8
137+
.chain(b"...".iter().copied())
138+
.collect::<Vec<u8>>(),
139+
)
140+
.unwrap() // by construction, this cannot panic as we extracted unicode grapheme
130141
} else {
131142
query.to_string()
132143
}
133144
}
134145
}
146+
147+
#[cfg(test)]
mod test {
    use crate::sql_common::SQLCommon;

    // The 64th character of this statement is the two-byte `é`, so byte
    // offset 64 lands in the middle of a UTF-8 sequence.
    const LONG_INSERT_WITH_UNICODE_AT_TRUNCATION_POINT: &str =
        "INSERT INTO `test` VALUES ('abcd', 'def'),('abcd', 'def'),('abcé', 'def');";

    /// Documents the failure mode `short_sql` must avoid: slicing the fixture
    /// at byte 64 panics because that offset is not a char boundary.
    #[test]
    #[should_panic]
    fn test_invalid_string_truncation() {
        let _truncated = format!(
            "{}...",
            &LONG_INSERT_WITH_UNICODE_AT_TRUNCATION_POINT[..64]
        );
    }

    /// Checks that `short_sql` leaves short INSERT statements untouched and
    /// cuts long ones after 64 characters without splitting a multi-byte one.
    #[test]
    fn test_short_sql_truncation_on_unicode() {
        // A short INSERT statement comes back unchanged.
        let short_insert = "INSERT INTO `test` VALUES('abcd', 'def');";
        assert_eq!(SQLCommon::short_sql(short_insert), short_insert);

        // A long INSERT statement is cut after its 64th character.
        let shortened = SQLCommon::short_sql(LONG_INSERT_WITH_UNICODE_AT_TRUNCATION_POINT);
        // 63 single-byte chars + the two-byte `é` + the three-byte "..." suffix.
        assert_eq!(shortened.len(), 68);
        assert_eq!(
            shortened,
            "INSERT INTO `test` VALUES ('abcd', 'def'),('abcd', 'def'),('abcé..."
        );
    }
}

0 commit comments

Comments
 (0)