@@ -18,6 +18,7 @@ use common_datavalues::prelude::*;
18
18
use common_exception:: ErrorCode ;
19
19
use common_exception:: Result ;
20
20
use sqlparser:: ast:: DataType as SQLDataType ;
21
+ use unicode_segmentation:: UnicodeSegmentation ;
21
22
22
23
pub struct SQLCommon ;
23
24
@@ -126,9 +127,51 @@ impl SQLCommon {
126
127
pub fn short_sql ( query : & str ) -> String {
127
128
let query = query. trim_start ( ) ;
128
129
if query. len ( ) >= 64 && query[ ..6 ] . eq_ignore_ascii_case ( "INSERT" ) {
129
- format ! ( "{}..." , & query[ ..64 ] )
130
+ // keep first 64 graphemes
131
+ String :: from_utf8 (
132
+ query
133
+ . graphemes ( true )
134
+ . take ( 64 )
135
+ . flat_map ( |g| g. as_bytes ( ) . iter ( ) )
136
+ . copied ( ) // copied converts &u8 into u8
137
+ . chain ( b"..." . iter ( ) . copied ( ) )
138
+ . collect :: < Vec < u8 > > ( ) ,
139
+ )
140
+ . unwrap ( ) // by construction, this cannot panic as we extracted unicode grapheme
130
141
} else {
131
142
query. to_string ( )
132
143
}
133
144
}
134
145
}
146
+
147
#[cfg(test)]
mod test {
    use crate::sql_common::SQLCommon;

    /// A long INSERT statement in which byte offset 64 falls in the middle of
    /// the multi-byte `é`, so a byte-based cut at 64 is not on a char boundary.
    const LONG_INSERT_WITH_UNICODE_AT_TRUNCATION_POINT: &str =
        "INSERT INTO `test` VALUES ('abcd', 'def'),('abcd', 'def'),('abcé', 'def');";

    #[test]
    #[should_panic]
    fn test_invalid_string_truncation() {
        // Demonstrates the failure mode of byte-based truncation: slicing this
        // statement at byte 64 panics. This confirms that the fixture really
        // exercises the case `short_sql` has to handle without panicking.
        let _ = format!(
            "{}...",
            &LONG_INSERT_WITH_UNICODE_AT_TRUNCATION_POINT[..64]
        );
    }

    #[test]
    fn test_short_sql_truncation_on_unicode() {
        // Short INSERT statements are returned untouched.
        assert_eq!(
            SQLCommon::short_sql("INSERT INTO `test` VALUES('abcd', 'def');"),
            "INSERT INTO `test` VALUES('abcd', 'def');"
        );

        // Long ones are cut after the 64th character and suffixed with "...".
        let shortened = SQLCommon::short_sql(LONG_INSERT_WITH_UNICODE_AT_TRUNCATION_POINT);
        // 64 characters, one of them multi-byte (é), plus the 3-byte "...".
        assert_eq!(shortened.len(), 68);
        assert_eq!(
            shortened,
            "INSERT INTO `test` VALUES ('abcd', 'def'),('abcd', 'def'),('abcé..."
        );
    }
}
0 commit comments