Skip to content

Commit 2706eed

Browse files
authored
[YQL-17103] Support StartsWith with pg types in extract_predicate library (#854)
* support StartsWith predicates for pg types * add test for string lookup with pg * add docs * fix pg type name
1 parent e95f266 commit 2706eed

File tree

13 files changed

+306
-16
lines changed

13 files changed

+306
-16
lines changed

ydb/docs/ru/core/yql/reference/yql-core/builtins/_includes/basic/starts_ends_with.md

+7-9
Original file line numberDiff line numberDiff line change
@@ -4,23 +4,18 @@
44

55
**Сигнатуры**
66
```
7-
StartsWith(Utf8, Utf8)->Bool
8-
StartsWith(Utf8[?], Utf8[?])->Bool?
9-
StartsWith(String, String)->Bool
10-
StartsWith(String[?], String[?])->Bool?
7+
StartsWith(T str, U prefix)->Bool[?]
118
12-
EndsWith(Utf8, Utf8)->Bool
13-
EndsWith(Utf8[?], Utf8[?])->Bool?
14-
EndsWith(String, String)->Bool
15-
EndsWith(String[?], String[?])->Bool?
9+
EndsWith(T str, U suffix)->Bool[?]
1610
```
1711

1812
Обязательные аргументы:
1913

2014
* Исходная строка;
2115
* Искомая подстрока.
2216

23-
Аргументы могут быть типов `String` или `Utf8` и могут быть опциональными.
17+
Аргументы должны иметь тип `String`/`Utf8` (или опциональный `String`/`Utf8`) либо строковый PostgreSQL тип (`PgText`/`PgBytea`/`PgVarchar`).
18+
Результатом функции является опциональный Bool, за исключением случая, когда оба аргумента неопциональные – в этом случае возвращается Bool.
2419

2520
**Примеры**
2621
``` yql
@@ -35,3 +30,6 @@ SELECT StartsWith("abcd", NULL); -- null
3530
``` yql
3631
SELECT EndsWith(NULL, Utf8("")); -- null
3732
```
33+
``` yql
34+
SELECT StartsWith("abc_efg"u, "abc"p) AND EndsWith("abc_efg", "efg"pv); -- true
35+
```

ydb/library/yql/core/common_opt/yql_co_simple1.cpp

+10
Original file line numberDiff line numberDiff line change
@@ -4665,6 +4665,16 @@ void RegisterCoSimpleCallables1(TCallableOptimizerMap& map) {
46654665
map["IsDistinctFrom"] = std::bind(&OptimizeDistinctFrom<false>, _1, _2);
46664666

46674667
map["StartsWith"] = map["EndsWith"] = map["StringContains"] = [](const TExprNode::TPtr& node, TExprContext& ctx, TOptimizeContext& /*optCtx*/) {
4668+
if (node->Head().GetTypeAnn()->GetKind() == ETypeAnnotationKind::Pg || node->Tail().GetTypeAnn()->GetKind() == ETypeAnnotationKind::Pg) {
4669+
TExprNodeList converted;
4670+
for (auto& child : node->ChildrenList()) {
4671+
const bool isPg = child->GetTypeAnn()->GetKind() == ETypeAnnotationKind::Pg;
4672+
converted.emplace_back(ctx.WrapByCallableIf(isPg, "FromPg", std::move(child)));
4673+
}
4674+
YQL_CLOG(DEBUG, Core) << "Converting Pg strings to YQL strings in " << node->Content();
4675+
return ctx.ChangeChildren(*node, std::move(converted));
4676+
}
4677+
46684678
if (node->Tail().IsCallable("String") && node->Tail().Head().Content().empty()) {
46694679
YQL_CLOG(DEBUG, Core) << node->Content() << " with empty string in second argument";
46704680
if (node->GetTypeAnn()->GetKind() == ETypeAnnotationKind::Optional) {

ydb/library/yql/core/extract_predicate/extract_predicate_impl.cpp

+23-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include "extract_predicate_impl.h"
22

3+
#include <ydb/library/yql/core/type_ann/type_ann_pg.h>
34
#include <ydb/library/yql/core/yql_expr_type_annotation.h>
45
#include <ydb/library/yql/core/yql_opt_utils.h>
56
#include <ydb/library/yql/core/yql_expr_constraint.h>
@@ -781,6 +782,17 @@ TExprNode::TPtr OptimizeNodeForRangeExtraction(const TExprNode::TPtr& node, cons
781782
}
782783
}
783784

785+
if (node->IsCallable("StartsWith")) {
786+
if (node->Head().IsCallable("FromPg")) {
787+
YQL_CLOG(DEBUG, Core) << "Get rid of FromPg() in " << node->Content() << " first argument";
788+
return ctx.ChangeChild(*node, 0, node->Head().HeadPtr());
789+
}
790+
if (node->Tail().GetTypeAnn()->GetKind() == ETypeAnnotationKind::Pg) {
791+
YQL_CLOG(DEBUG, Core) << "Convert second argument of " << node->Content() << " from PG type";
792+
return ctx.ChangeChild(*node, 1, ctx.NewCallable(node->Tail().Pos(), "FromPg", {node->TailPtr()}));
793+
}
794+
}
795+
784796
return node;
785797
}
786798

@@ -911,13 +923,22 @@ TExprNode::TPtr BuildSingleComputeRange(const TStructExprType& rowType,
911923

912924
if (opNode->IsCallable("StartsWith")) {
913925
YQL_ENSURE(keys.size() == 1);
914-
return ctx.Builder(pos)
926+
const bool keyIsPg = firstKeyType->GetKind() == ETypeAnnotationKind::Pg;
927+
const TTypeAnnotationNode* rangeForType = firstKeyType;
928+
if (keyIsPg) {
929+
const TTypeAnnotationNode* yqlType = NTypeAnnImpl::FromPgImpl(pos, firstKeyType, ctx);
930+
YQL_ENSURE(yqlType);
931+
rangeForType = yqlType;
932+
YQL_ENSURE(opNode->Tail().GetTypeAnn()->GetKind() != ETypeAnnotationKind::Pg);
933+
}
934+
auto rangeForNode = ctx.Builder(pos)
915935
.Callable("RangeFor")
916936
.Atom(0, hasNot ? "NotStartsWith" : "StartsWith", TNodeFlags::Default)
917937
.Add(1, opNode->TailPtr())
918-
.Add(2, ExpandType(pos, *firstKeyType, ctx))
938+
.Add(2, ExpandType(pos, *rangeForType, ctx))
919939
.Seal()
920940
.Build();
941+
return ctx.WrapByCallableIf(keyIsPg, "RangeToPg", std::move(rangeForNode));
921942
}
922943

923944
if (opNode->IsCallable("SqlIn")) {

ydb/library/yql/core/extract_predicate/ya.make

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ SRCS(
1010

1111
PEERDIR(
1212
ydb/library/yql/core/services
13+
ydb/library/yql/core/type_ann
1314
)
1415

1516
YQL_LAST_ABI_VERSION()

ydb/library/yql/core/peephole_opt/yql_opt_peephole_physical.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -7602,6 +7602,7 @@ struct TPeepHoleRules {
76027602
{"RangeEmpty", &ExpandRangeEmpty},
76037603
{"AsRange", &ExpandAsRange},
76047604
{"RangeFor", &ExpandRangeFor},
7605+
{"RangeToPg", &ExpandRangeToPg},
76057606
{"ToFlow", &DropToFlowDeps},
76067607
{"CheckedAdd", &ExpandCheckedAdd},
76077608
{"CheckedSub", &ExpandCheckedSub},

ydb/library/yql/core/type_ann/type_ann_core.cpp

+66-5
Original file line numberDiff line numberDiff line change
@@ -3225,14 +3225,32 @@ namespace NTypeAnnImpl {
32253225
return IGraphTransformer::TStatus::Repeat;
32263226
}
32273227

3228-
bool isOptional1, isOptional2;
3229-
if (const TDataExprType *dataTypeOne, *dataTypeTwo;
3230-
!(EnsureDataOrOptionalOfData(input->Head(), isOptional1, dataTypeOne, ctx.Expr) && EnsureDataOrOptionalOfData(input->Tail(), isOptional2, dataTypeTwo, ctx.Expr)
3231-
&& EnsureStringOrUtf8Type(input->Head().Pos(), *dataTypeOne, ctx.Expr) && EnsureStringOrUtf8Type(input->Tail().Pos(), *dataTypeTwo, ctx.Expr))) {
3228+
if (!EnsureComputable(input->Head(), ctx.Expr) || !EnsureComputable(input->Tail(), ctx.Expr)) {
32323229
return IGraphTransformer::TStatus::Error;
32333230
}
32343231

3235-
if (isOptional1 || isOptional2)
3232+
bool hasOptionals = false;
3233+
for (auto& child : input->ChildrenList()) {
3234+
const TTypeAnnotationNode* type = child->GetTypeAnn();
3235+
if (type->GetKind() == ETypeAnnotationKind::Pg) {
3236+
type = FromPgImpl(child->Pos(), type, ctx.Expr);
3237+
if (!type) {
3238+
return IGraphTransformer::TStatus::Error;
3239+
}
3240+
}
3241+
bool isOptional = false;
3242+
const TDataExprType* dataType = nullptr;
3243+
if (!IsDataOrOptionalOfData(type, isOptional, dataType) ||
3244+
!(dataType->GetSlot() == EDataSlot::String || dataType->GetSlot() == EDataSlot::Utf8))
3245+
{
3246+
ctx.Expr.AddError(TIssue(ctx.Expr.GetPosition(child->Pos()), TStringBuilder()
3247+
<< "Expected (optional) string/utf8 or corresponding Pg type, but got: " << *child->GetTypeAnn()));
3248+
return IGraphTransformer::TStatus::Error;
3249+
}
3250+
hasOptionals = hasOptionals || isOptional;
3251+
}
3252+
3253+
if (hasOptionals)
32363254
input->SetTypeAnn(ctx.Expr.MakeType<TOptionalExprType>(ctx.Expr.MakeType<TDataExprType>(EDataSlot::Bool)));
32373255
else
32383256
input->SetTypeAnn(ctx.Expr.MakeType<TDataExprType>(EDataSlot::Bool));
@@ -11107,6 +11125,48 @@ template <NKikimr::NUdf::EDataSlot DataSlot>
1110711125
return IGraphTransformer::TStatus::Ok;
1110811126
}
1110911127

11128+
// Type annotation handler for the "RangeToPg" callable.
//
// Expects exactly one argument whose type is a list; the list item type must
// pass EnsureValidRange. On success the annotation of `input` is set to
// List<Tuple<B, B>>, where B is the boundary tuple of the argument with its
// odd-indexed components converted to Pg types, and Ok is returned.
// Returns Error when any of the checks or the ToPgImpl conversion fails.
// `output` is not used by this handler.
IGraphTransformer::TStatus RangeToPgWrapper(const TExprNode::TPtr& input, TExprNode::TPtr& output, TContext& ctx) {
11129+
Y_UNUSED(output);
11130+
11131+
if (!EnsureArgsCount(*input, 1, ctx.Expr)) {
11132+
return IGraphTransformer::TStatus::Error;
11133+
}
11134+
11135+
if (!EnsureListType(input->Head(), ctx.Expr)) {
11136+
return IGraphTransformer::TStatus::Error;
11137+
}
11138+
11139+
auto argType = input->Head().GetTypeAnn();
11140+
auto rangeType = argType->Cast<TListExprType>()->GetItemType();
11141+
if (!EnsureValidRange(input->Head().Pos(), rangeType, ctx.Expr)) {
11142+
return IGraphTransformer::TStatus::Error;
11143+
}
11144+
11145+
// Only the first element of the range tuple is inspected; the rewritten
// boundary type is reused for both elements of the resulting range tuple.
auto boundaryType = rangeType->Cast<TTupleExprType>()->GetItems().front();
11146+
const auto& boundaryItems = boundaryType->Cast<TTupleExprType>()->GetItems();
11147+
11148+
TTypeAnnotationNode::TListType resultBoundaryItems;
11149+
resultBoundaryItems.reserve(boundaryItems.size());
11150+
// Even-indexed boundary items are copied unchanged. Odd-indexed items are
// cast to Optional, their item type is converted with ToPgImpl and the
// result is wrapped back into Optional.
// NOTE(review): the unconditional Cast<TOptionalExprType> presumably relies
// on EnsureValidRange having validated the boundary layout - confirm.
for (size_t i = 0; i < boundaryItems.size(); ++i) {
11151+
if (i % 2 == 0) {
11152+
resultBoundaryItems.push_back(boundaryItems[i]);
11153+
} else {
11154+
auto keyType = boundaryItems[i]->Cast<TOptionalExprType>()->GetItemType();
11155+
auto pgKeyType = ToPgImpl(input->Head().Pos(), keyType, ctx.Expr);
11156+
if (!pgKeyType) {
11157+
return IGraphTransformer::TStatus::Error;
11158+
}
11159+
resultBoundaryItems.push_back(ctx.Expr.MakeType<TOptionalExprType>(pgKeyType));
11160+
}
11161+
}
11162+
11163+
const TTypeAnnotationNode* resultBoundaryType = ctx.Expr.MakeType<TTupleExprType>(resultBoundaryItems);
11164+
const TTypeAnnotationNode* resultRangeType =
11165+
ctx.Expr.MakeType<TTupleExprType>(TTypeAnnotationNode::TListType{resultBoundaryType, resultBoundaryType});
11166+
input->SetTypeAnn(ctx.Expr.MakeType<TListExprType>(resultRangeType));
11167+
return IGraphTransformer::TStatus::Ok;
11168+
}
11169+
1111011170
IGraphTransformer::TStatus RangeCreateWrapper(const TExprNode::TPtr& input, TExprNode::TPtr& output, TContext& ctx) {
1111111171
Y_UNUSED(output);
1111211172

@@ -12164,6 +12224,7 @@ template <NKikimr::NUdf::EDataSlot DataSlot>
1216412224
ExtFunctions["OrderedSqlRename"] = &SqlRenameWrapper;
1216512225

1216612226
Functions["AsRange"] = &AsRangeWrapper;
12227+
Functions["RangeToPg"] = &RangeToPgWrapper;
1216712228
Functions["RangeCreate"] = &RangeCreateWrapper;
1216812229
Functions["RangeEmpty"] = &RangeEmptyWrapper;
1216912230
Functions["RangeFor"] = &RangeForWrapper;

ydb/library/yql/core/yql_opt_range.cpp

+47
Original file line numberDiff line numberDiff line change
@@ -519,4 +519,51 @@ TExprNode::TPtr ExpandRangeFor(const TExprNode::TPtr& node, TExprContext& ctx) {
519519
return result;
520520
}
521521

522+
// Expands a "RangeToPg" node into lower-level callables.
//
// The result is an OrderedMap over the node's single input; each item
// ("range") is processed with StaticMap, and each "boundary" inside it is
// rebuilt component by component: even-indexed components are taken as-is
// with Nth, odd-indexed components are passed through Map with a ToPg lambda.
// The number of components is read from the type annotation of the input
// (size of the first boundary tuple of the list item type).
// Fails via YQL_ENSURE if `node` is not a "RangeToPg" callable.
TExprNode::TPtr ExpandRangeToPg(const TExprNode::TPtr& node, TExprContext& ctx) {
523+
YQL_ENSURE(node->IsCallable("RangeToPg"));
524+
const size_t numComponents = node->Head().GetTypeAnn()->Cast<TListExprType>()->GetItemType()->
525+
Cast<TTupleExprType>()->GetItems().front()->Cast<TTupleExprType>()->GetSize();
526+
return ctx.Builder(node->Pos())
527+
.Callable("OrderedMap")
528+
.Add(0, node->HeadPtr())
529+
.Lambda(1)
530+
.Param("range")
531+
.Callable("StaticMap")
532+
.Arg(0, "range")
533+
.Lambda(1)
534+
.Param("boundary")
535+
.List()
536+
.Do([&](TExprNodeBuilder& parent) -> TExprNodeBuilder& {
537+
// Emit one child per boundary component, keeping the original positions.
for (size_t i = 0; i < numComponents; ++i) {
538+
if (i % 2 == 0) {
539+
// Even position: forwarded unchanged as Nth(boundary, i).
parent
540+
.Callable(i, "Nth")
541+
.Arg(0, "boundary")
542+
.Atom(1, i)
543+
.Seal();
544+
} else {
545+
// Odd position: Map over Nth(boundary, i), applying ToPg to the
// contained value.
// NOTE(review): presumably this component is Optional<key> - confirm.
parent
546+
.Callable(i, "Map")
547+
.Callable(0, "Nth")
548+
.Arg(0, "boundary")
549+
.Atom(1, i)
550+
.Seal()
551+
.Lambda(1)
552+
.Param("unwrapped")
553+
.Callable("ToPg")
554+
.Arg(0, "unwrapped")
555+
.Seal()
556+
.Seal()
557+
.Seal();
558+
}
559+
}
560+
return parent;
561+
})
562+
.Seal()
563+
.Seal()
564+
.Seal()
565+
.Seal()
566+
.Seal()
567+
.Build();
568+
}
522569
}

ydb/library/yql/core/yql_opt_range.h

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ namespace NYql {
66
TExprNode::TPtr ExpandRangeEmpty(const TExprNode::TPtr& node, TExprContext& ctx);
77
TExprNode::TPtr ExpandAsRange(const TExprNode::TPtr& node, TExprContext& ctx);
88
TExprNode::TPtr ExpandRangeFor(const TExprNode::TPtr& node, TExprContext& ctx);
9+
TExprNode::TPtr ExpandRangeToPg(const TExprNode::TPtr& node, TExprContext& ctx);
910

1011
}
1112

ydb/library/yql/tests/sql/dq_file/part5/canondata/result.json

+22
Original file line numberDiff line numberDiff line change
@@ -2210,6 +2210,28 @@
22102210
}
22112211
],
22122212
"test.test[pg-select_win_count-default.txt-Results]": [],
2213+
"test.test[pg-str_lookup_pg-default.txt-Analyze]": [
2214+
{
2215+
"checksum": "a48ccc9922567dfee1170d2c2df45b6e",
2216+
"size": 2153,
2217+
"uri": "https://{canondata_backend}/1784826/cbc63541f63d78da712c6e11ae70c4ee10dfb428/resource.tar.gz#test.test_pg-str_lookup_pg-default.txt-Analyze_/plan.txt"
2218+
}
2219+
],
2220+
"test.test[pg-str_lookup_pg-default.txt-Debug]": [
2221+
{
2222+
"checksum": "851bbcc3bbf2c5f21c51a7d61851aba1",
2223+
"size": 1657,
2224+
"uri": "https://{canondata_backend}/1784826/cbc63541f63d78da712c6e11ae70c4ee10dfb428/resource.tar.gz#test.test_pg-str_lookup_pg-default.txt-Debug_/opt.yql_patched"
2225+
}
2226+
],
2227+
"test.test[pg-str_lookup_pg-default.txt-Plan]": [
2228+
{
2229+
"checksum": "a48ccc9922567dfee1170d2c2df45b6e",
2230+
"size": 2153,
2231+
"uri": "https://{canondata_backend}/1784826/cbc63541f63d78da712c6e11ae70c4ee10dfb428/resource.tar.gz#test.test_pg-str_lookup_pg-default.txt-Plan_/plan.txt"
2232+
}
2233+
],
2234+
"test.test[pg-str_lookup_pg-default.txt-Results]": [],
22132235
"test.test[pg-sublink_order_any_corr-default.txt-Analyze]": [
22142236
{
22152237
"checksum": "b4dd508a329723c74293d80f0278c705",

ydb/library/yql/tests/sql/sql2yql/canondata/result.json

+28
Original file line numberDiff line numberDiff line change
@@ -3947,6 +3947,13 @@
39473947
"uri": "https://{canondata_backend}/1773845/fe2146df711e0729e3c3cc1bc9b2c5b1fdfcfea1/resource.tar.gz#test_sql2yql.test_compute_range-pg_sqlin_/sql.yql"
39483948
}
39493949
],
3950+
"test_sql2yql.test[compute_range-pg_startswith]": [
3951+
{
3952+
"checksum": "f2e42e95b7b84fd210244e0c61c3f614",
3953+
"size": 4450,
3954+
"uri": "https://{canondata_backend}/1031349/96841816c51116681477e138bb81b6493013c777/resource.tar.gz#test_sql2yql.test_compute_range-pg_startswith_/sql.yql"
3955+
}
3956+
],
39503957
"test_sql2yql.test[compute_range-preserve_rest_predicates_order]": [
39513958
{
39523959
"checksum": "4915841ad83886d7f63fe939e0848687",
@@ -12067,6 +12074,13 @@
1206712074
"uri": "https://{canondata_backend}/1599023/af9c2f81df0601cf266a0926b5ce73b6101b9115/resource.tar.gz#test_sql2yql.test_pg-single_input_filter_over_join_/sql.yql"
1206812075
}
1206912076
],
12077+
"test_sql2yql.test[pg-str_lookup_pg]": [
12078+
{
12079+
"checksum": "15ae2647f3110534a4e0e10d89a19e35",
12080+
"size": 6373,
12081+
"uri": "https://{canondata_backend}/1775059/5625478e977a363be64a17bebddbd8ed18706eac/resource.tar.gz#test_sql2yql.test_pg-str_lookup_pg_/sql.yql"
12082+
}
12083+
],
1207012084
"test_sql2yql.test[pg-struct_tuple_cast]": [
1207112085
{
1207212086
"checksum": "e99eaf940d72eb246c5fe60c7f2f687d",
@@ -21251,6 +21265,13 @@
2125121265
"uri": "https://{canondata_backend}/1773845/fe2146df711e0729e3c3cc1bc9b2c5b1fdfcfea1/resource.tar.gz#test_sql_format.test_compute_range-pg_sqlin_/formatted.sql"
2125221266
}
2125321267
],
21268+
"test_sql_format.test[compute_range-pg_startswith]": [
21269+
{
21270+
"checksum": "b06b88f1965f643fea24cb7e5d8d0459",
21271+
"size": 955,
21272+
"uri": "https://{canondata_backend}/1031349/96841816c51116681477e138bb81b6493013c777/resource.tar.gz#test_sql_format.test_compute_range-pg_startswith_/formatted.sql"
21273+
}
21274+
],
2125421275
"test_sql_format.test[compute_range-preserve_rest_predicates_order]": [
2125521276
{
2125621277
"checksum": "77cd36176a336f2a79ee10f5697b124f",
@@ -28055,6 +28076,13 @@
2805528076
"uri": "https://{canondata_backend}/1880306/64654158d6bfb1289c66c626a8162239289559d0/resource.tar.gz#test_sql_format.test_pg-simple_ops_/formatted.sql"
2805628077
}
2805728078
],
28079+
"test_sql_format.test[pg-str_lookup_pg]": [
28080+
{
28081+
"checksum": "f1954f2bb0c2bf59abe9752284f424cc",
28082+
"size": 637,
28083+
"uri": "https://{canondata_backend}/1775059/5625478e977a363be64a17bebddbd8ed18706eac/resource.tar.gz#test_sql_format.test_pg-str_lookup_pg_/formatted.sql"
28084+
}
28085+
],
2805828086
"test_sql_format.test[pg-struct_tuple_cast]": [
2805928087
{
2806028088
"checksum": "d77766b8458d94c8c4af56c3d439d2dd",
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
/* syntax version 1 */
2+
/* postgres can not */
3+
/* dq can not */
4+
/* dqfile can not */
5+
/* yt can not */
6+
pragma warning("disable", "4510");
7+
pragma warning("disable", "1108");
8+
9+
-- like 'aaaa'
-- Positive prefix predicate on the PgText column b (read through FromPg);
-- a NULL StartsWith result is coalesced to false. Ranges are computed for key (b).
10+
select YQL::RangeComputeFor(
11+
Struct<a:PgInt4,b:PgText>,
12+
($row) -> (StartsWith(FromPg($row.b), 'aaaa') ?? false),
13+
AsTuple(AsAtom("b"))
14+
);
15+
16+
-- not like 'aaaa'
-- Negated form of the same predicate; a NULL StartsWith result is coalesced
-- to true before the negation is applied.
17+
select YQL::RangeComputeFor(
18+
Struct<a:PgInt4,b:PgText>,
19+
($row) -> (not (StartsWith(FromPg($row.b), 'aaaa') ?? true)),
20+
AsTuple(AsAtom("b"))
21+
);
22+
23+
24+
-- like <invalid utf8>
-- Same as the first query, but the prefix literal contains the byte \xf5.
25+
select YQL::RangeComputeFor(
26+
Struct<a:PgInt4,b:PgText>,
27+
($row) -> (StartsWith(FromPg($row.b), 'a\xf5') ?? false),
28+
AsTuple(AsAtom("b"))
29+
);
30+
31+
-- not like <invalid utf8>
-- Negated form of the previous query with the same \xf5 prefix literal.
32+
select YQL::RangeComputeFor(
33+
Struct<a:PgInt4,b:PgText>,
34+
($row) -> (not (StartsWith(FromPg($row.b), 'a\xf5') ?? true)),
35+
AsTuple(AsAtom("b"))
36+
);

0 commit comments

Comments
 (0)