Skip to content

Commit d030920

Browse files
authored
YQL-17725 unicode string literals (#1528)
* init * fix
1 parent 366a347 commit d030920

File tree

12 files changed

+142
-7
lines changed

12 files changed

+142
-7
lines changed

ydb/library/yql/core/issue/protos/issue_id.proto

+1
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ message TIssuesIds {
141141
YQL_OFFSET_WITHOUT_SORT = 4537;
142142
YQL_DEPRECATED_BINDINGS = 4538;
143143
YQL_HINT_INVALID_PARAMETERS = 4539;
144+
YQL_UNTYPED_STRING_LITERALS = 4540;
144145

145146
// yql parser errors
146147
YQL_NOT_ALLOWED_IN_DISCOVERY = 4600;

ydb/library/yql/core/issue/yql_issue.txt

+5-1
Original file line numberDiff line numberDiff line change
@@ -646,4 +646,8 @@ ids {
646646
ids {
647647
code: PG_NO_LOCKING_SUPPORT
648648
severity: S_WARNING
649-
}
649+
}
650+
ids {
651+
code: YQL_UNTYPED_STRING_LITERALS
652+
severity: S_WARNING
653+
}

ydb/library/yql/sql/settings/translation_settings.h

+1
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ namespace NSQLTranslation {
114114
THashSet<TString> AutoParametrizeExprDisabledScopes = {};
115115

116116
TGUCSettings::TPtr GUCSettings = std::make_shared<TGUCSettings>();
117+
bool UnicodeLiterals = false;
117118
};
118119

119120
bool ParseTranslationSettings(const TString& query, NSQLTranslation::TTranslationSettings& settings, NYql::TIssues& issues);

ydb/library/yql/sql/v1/SQLv1.g.in

+1-1
Original file line numberDiff line numberDiff line change
@@ -1783,7 +1783,7 @@ fragment STRING_SINGLE: (QUOTE_SINGLE STRING_CORE_SINGLE* QUOTE_SINGLE);
17831783
fragment STRING_DOUBLE: (QUOTE_DOUBLE STRING_CORE_DOUBLE* QUOTE_DOUBLE);
17841784
fragment STRING_MULTILINE: (DOUBLE_AT .* DOUBLE_AT)+ AT?;
17851785

1786-
STRING_VALUE: ((STRING_SINGLE | STRING_DOUBLE | STRING_MULTILINE) (U | Y | J | P (T | B | V)?)?);
1786+
STRING_VALUE: ((STRING_SINGLE | STRING_DOUBLE | STRING_MULTILINE) (S | B | T | U | Y | J | P (T | B | V)?)?);
17871787

17881788
ID_PLAIN: ('a'..'z' | 'A'..'Z' | '_') ('a'..'z' | 'A'..'Z' | '_' | DIGIT)*;
17891789

ydb/library/yql/sql/v1/context.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ TContext::TContext(const NSQLTranslation::TTranslationSettings& settings,
9494

9595
Scoped = MakeIntrusive<TScopedState>();
9696
AllScopes.push_back(Scoped);
97+
Scoped->UnicodeLiterals = settings.UnicodeLiterals;
9798
if (settings.DefaultCluster) {
9899
Scoped->CurrCluster = TDeferredAtom({}, settings.DefaultCluster);
99100
auto provider = GetClusterProvider(settings.DefaultCluster);

ydb/library/yql/sql/v1/context.h

+2
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ namespace NSQLTranslationV1 {
5050
bool PragmaClassicDivision = true;
5151
bool PragmaCheckedOps = false;
5252
bool StrictJoinKeyTypes = false;
53+
bool UnicodeLiterals = false;
54+
bool WarnUntypedStringLiterals = false;
5355
TNamedNodesMap NamedNodes;
5456

5557
struct TLocal {

ydb/library/yql/sql/v1/node.cpp

+16-4
Original file line numberDiff line numberDiff line change
@@ -1406,10 +1406,7 @@ StringContentInternal(TContext& ctx, TPosition pos, const TString& input, EStrin
14061406
TString str = input;
14071407
if (mode == EStringContentMode::TypedStringLiteral) {
14081408
auto lower = to_lower(str);
1409-
if (lower.EndsWith("u")) {
1410-
str = str.substr(0, str.Size() - 1);
1411-
result.Type = NKikimr::NUdf::EDataSlot::Utf8;
1412-
} else if (lower.EndsWith("y")) {
1409+
if (lower.EndsWith("y")) {
14131410
str = str.substr(0, str.Size() - 1);
14141411
result.Type = NKikimr::NUdf::EDataSlot::Yson;
14151412
} else if (lower.EndsWith("j")) {
@@ -1427,6 +1424,21 @@ StringContentInternal(TContext& ctx, TPosition pos, const TString& input, EStrin
14271424
} else if (lower.EndsWith("pv")) {
14281425
str = str.substr(0, str.Size() - 2);
14291426
result.PgType = "PgVarchar";
1427+
} else if (lower.EndsWith("s") || lower.EndsWith("b")) {
1428+
str = str.substr(0, str.Size() - 1);
1429+
result.Type = NKikimr::NUdf::EDataSlot::String;
1430+
} else if (lower.EndsWith("u") || lower.EndsWith("t")) {
1431+
str = str.substr(0, str.Size() - 1);
1432+
result.Type = NKikimr::NUdf::EDataSlot::Utf8;
1433+
} else {
1434+
if (ctx.Scoped->WarnUntypedStringLiterals) {
1435+
ctx.Warning(pos, TIssuesIds::YQL_UNTYPED_STRING_LITERALS)
1436+
<< "Please add suffix u or t for Utf8 strings or s or b for arbitrary binary strings";
1437+
}
1438+
1439+
if (ctx.Scoped->UnicodeLiterals) {
1440+
result.Type = NKikimr::NUdf::EDataSlot::Utf8;
1441+
}
14301442
}
14311443
}
14321444

ydb/library/yql/sql/v1/sql_query.cpp

+22-1
Original file line numberDiff line numberDiff line change
@@ -1620,7 +1620,16 @@ TNodePtr TSqlQuery::PragmaStatement(const TRule_pragma_stmt& stmt, bool& success
16201620
}
16211621

16221622
const bool withConfigure = prefix || normalizedPragma == "file" || normalizedPragma == "folder" || normalizedPragma == "udf";
1623-
static const THashSet<TStringBuf> lexicalScopePragmas = {"classicdivision", "strictjoinkeytypes", "disablestrictjoinkeytypes", "checkedops"};
1623+
static const THashSet<TStringBuf> lexicalScopePragmas = {
1624+
"classicdivision",
1625+
"strictjoinkeytypes",
1626+
"disablestrictjoinkeytypes",
1627+
"checkedops",
1628+
"unicodeliterals",
1629+
"disableunicodeliterals",
1630+
"warnuntypedstringliterals",
1631+
"disablewarnuntypedstringliterals",
1632+
};
16241633
const bool hasLexicalScope = withConfigure || lexicalScopePragmas.contains(normalizedPragma);
16251634
const bool withFileAlias = normalizedPragma == "file" || normalizedPragma == "folder" || normalizedPragma == "library" || normalizedPragma == "udf";
16261635
for (auto pragmaValue : pragmaValues) {
@@ -2190,6 +2199,18 @@ TNodePtr TSqlQuery::PragmaStatement(const TRule_pragma_stmt& stmt, bool& success
21902199
} else if (normalizedPragma == "disablestrictjoinkeytypes") {
21912200
Ctx.Scoped->StrictJoinKeyTypes = false;
21922201
Ctx.IncrementMonCounter("sql_pragma", "DisableStrictJoinKeyTypes");
2202+
} else if (normalizedPragma == "unicodeliterals") {
2203+
Ctx.Scoped->UnicodeLiterals = true;
2204+
Ctx.IncrementMonCounter("sql_pragma", "UnicodeLiterals");
2205+
} else if (normalizedPragma == "disableunicodeliterals") {
2206+
Ctx.Scoped->UnicodeLiterals = false;
2207+
Ctx.IncrementMonCounter("sql_pragma", "DisableUnicodeLiterals");
2208+
} else if (normalizedPragma == "warnuntypedstringliterals") {
2209+
Ctx.Scoped->WarnUntypedStringLiterals = true;
2210+
Ctx.IncrementMonCounter("sql_pragma", "WarnUntypedStringLiterals");
2211+
} else if (normalizedPragma == "disablewarnuntypedstringliterals") {
2212+
Ctx.Scoped->WarnUntypedStringLiterals = false;
2213+
Ctx.IncrementMonCounter("sql_pragma", "DisableWarnUntypedStringLiterals");
21932214
} else if (normalizedPragma == "unorderedsubqueries") {
21942215
Ctx.UnorderedSubqueries = true;
21952216
Ctx.IncrementMonCounter("sql_pragma", "UnorderedSubqueries");

ydb/library/yql/tests/sql/dq_file/part14/canondata/result.json

+29
Original file line numberDiff line numberDiff line change
@@ -1011,6 +1011,35 @@
10111011
}
10121012
],
10131013
"test.test[expr-to_sorted_set_tuple_key-default.txt-Results]": [],
1014+
"test.test[expr-unicode_literals-default.txt-Analyze]": [
1015+
{
1016+
"checksum": "a3b64a2cf9903b3868a2dd88a18fc46e",
1017+
"size": 922,
1018+
"uri": "https://{canondata_backend}/1871002/fb6fb37c565974a6f0c497e8b3e58f6b5bf320b2/resource.tar.gz#test.test_expr-unicode_literals-default.txt-Analyze_/plan.txt"
1019+
},
1020+
{
1021+
"uri": "file://test.test_expr-unicode_literals-default.txt-Analyze_/extracted"
1022+
}
1023+
],
1024+
"test.test[expr-unicode_literals-default.txt-Debug]": [
1025+
{
1026+
"checksum": "9201dbe44a3334deb0a063d58468a160",
1027+
"size": 522,
1028+
"uri": "https://{canondata_backend}/1871002/fb6fb37c565974a6f0c497e8b3e58f6b5bf320b2/resource.tar.gz#test.test_expr-unicode_literals-default.txt-Debug_/opt.yql_patched"
1029+
}
1030+
],
1031+
"test.test[expr-unicode_literals-default.txt-Plan]": [
1032+
{
1033+
"checksum": "a3b64a2cf9903b3868a2dd88a18fc46e",
1034+
"size": 922,
1035+
"uri": "https://{canondata_backend}/1871002/fb6fb37c565974a6f0c497e8b3e58f6b5bf320b2/resource.tar.gz#test.test_expr-unicode_literals-default.txt-Plan_/plan.txt"
1036+
}
1037+
],
1038+
"test.test[expr-unicode_literals-default.txt-Results]": [
1039+
{
1040+
"uri": "file://test.test_expr-unicode_literals-default.txt-Results_/extracted"
1041+
}
1042+
],
10141043
"test.test[expr-variant_tuple_comp-default.txt-Analyze]": [
10151044
{
10161045
"checksum": "01775e7c945a56ebf0edc2d478f4f68d",

ydb/library/yql/tests/sql/sql2yql/canondata/result.json

+14
Original file line numberDiff line numberDiff line change
@@ -5795,6 +5795,13 @@
57955795
"uri": "https://{canondata_backend}/1871182/6b10ad6d9884e5faf3a77187ffb9b38b59b46458/resource.tar.gz#test_sql2yql.test_expr-udaf_with_list_zip_/sql.yql"
57965796
}
57975797
],
5798+
"test_sql2yql.test[expr-unicode_literals]": [
5799+
{
5800+
"checksum": "9be93914e3d28b675e0eee080ef248ec",
5801+
"size": 1964,
5802+
"uri": "https://{canondata_backend}/1937367/9f749035d8f07b7ae5537f5aebd224641b378134/resource.tar.gz#test_sql2yql.test_expr-unicode_literals_/sql.yql"
5803+
}
5804+
],
57985805
"test_sql2yql.test[expr-untag]": [
57995806
{
58005807
"checksum": "e83bb3d6e0abd1069a2c5e30a7ec6409",
@@ -23267,6 +23274,13 @@
2326723274
"uri": "https://{canondata_backend}/1880306/64654158d6bfb1289c66c626a8162239289559d0/resource.tar.gz#test_sql_format.test_expr-udaf_with_list_zip_/formatted.sql"
2326823275
}
2326923276
],
23277+
"test_sql_format.test[expr-unicode_literals]": [
23278+
{
23279+
"checksum": "b470490a33e28dd2537f12d80329216a",
23280+
"size": 374,
23281+
"uri": "https://{canondata_backend}/1937367/9f749035d8f07b7ae5537f5aebd224641b378134/resource.tar.gz#test_sql_format.test_expr-unicode_literals_/formatted.sql"
23282+
}
23283+
],
2327023284
"test_sql_format.test[expr-untag]": [
2327123285
{
2327223286
"checksum": "af1b548d1c51945be876993b053bcc11",
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
pragma WarnUntypedStringLiterals;
2+
pragma UnicodeLiterals;
3+
$f = ()->{
4+
return (
5+
"a"s,
6+
"b"b,
7+
"c"t,
8+
"d"u,
9+
"e");
10+
};
11+
12+
select $f();
13+
14+
pragma DisableWarnUntypedStringLiterals;
15+
pragma DisableUnicodeLiterals;
16+
$g = ()->{
17+
return (
18+
"a"s,
19+
"b"b,
20+
"c"t,
21+
"d"u,
22+
"e");
23+
};
24+
25+
select $g();
26+

ydb/library/yql/tests/sql/yt_native_file/part14/canondata/result.json

+24
Original file line numberDiff line numberDiff line change
@@ -1025,6 +1025,30 @@
10251025
"uri": "https://{canondata_backend}/1937367/40af353047a2965dc4907c6a6b7a0b86a14045dd/resource.tar.gz#test.test_expr-to_sorted_set_tuple_key-default.txt-Results_/results.txt"
10261026
}
10271027
],
1028+
"test.test[expr-unicode_literals-default.txt-Debug]": [
1029+
{
1030+
"checksum": "b21fde16b24ef5500d9c21f811fc800b",
1031+
"size": 452,
1032+
"uri": "https://{canondata_backend}/1942671/fe81aca6675f95264895c6b4c3bafedf6b92cfd5/resource.tar.gz#test.test_expr-unicode_literals-default.txt-Debug_/opt.yql"
1033+
}
1034+
],
1035+
"test.test[expr-unicode_literals-default.txt-Plan]": [
1036+
{
1037+
"checksum": "a3b64a2cf9903b3868a2dd88a18fc46e",
1038+
"size": 922,
1039+
"uri": "https://{canondata_backend}/1942671/fe81aca6675f95264895c6b4c3bafedf6b92cfd5/resource.tar.gz#test.test_expr-unicode_literals-default.txt-Plan_/plan.txt"
1040+
}
1041+
],
1042+
"test.test[expr-unicode_literals-default.txt-Results]": [
1043+
{
1044+
"checksum": "634838888e147228dfbca0438c1c75d5",
1045+
"size": 3698,
1046+
"uri": "https://{canondata_backend}/1942671/fe81aca6675f95264895c6b4c3bafedf6b92cfd5/resource.tar.gz#test.test_expr-unicode_literals-default.txt-Results_/results.txt"
1047+
},
1048+
{
1049+
"uri": "file://test.test_expr-unicode_literals-default.txt-Results_/extracted"
1050+
}
1051+
],
10281052
"test.test[expr-variant_tuple_comp-default.txt-Debug]": [
10291053
{
10301054
"checksum": "535e6582b45481ccb48fdce0a827a92d",

0 commit comments

Comments
 (0)