Skip to content

Commit 1e656fe

Browse files
authored
[CBO] Fix Selectivity (#6645)
1 parent e53fe59 commit 1e656fe

File tree

4 files changed

+87
-40
lines changed

4 files changed

+87
-40
lines changed

ydb/core/kqp/opt/kqp_statistics_transformer.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,11 @@ void InferStatisticsForKqpTable(const TExprNode::TPtr& input, TTypeAnnotationCon
102102
if (kqpCtx.Config->OverrideStatistics.Get()) {
103103
stats = OverrideStatistics(*stats, path.Value(), *kqpCtx.Config->OverrideStatistics.Get());
104104
}
105+
if (stats->ColumnStatistics) {
106+
for (const auto& [columnName, metaData]: tableData.Metadata->Columns) {
107+
stats->ColumnStatistics->Data[columnName].Type = metaData.Type;
108+
}
109+
}
105110

106111
YQL_CLOG(TRACE, CoreDq) << "Infer statistics for table: " << path.Value() << ", nrows: " << stats->Nrows << ", nattrs: " << stats->Ncols << ", byteSize: " << stats->ByteSize << ", nKeyColumns: " << stats->KeyColumns->Data.size();
107112

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"op_name":"InnerJoin (Grace)","args":[{"op_name":"InnerJoin (MapJoin)","args":[{"op_name":"TableFullScan","table":"partsupp"},{"op_name":"InnerJoin (MapJoin)","args":[{"op_name":"TableFullScan","table":"supplier"},{"op_name":"TableFullScan","table":"nation"}]}]},{"op_name":"InnerJoin (MapJoin)","args":[{"op_name":"InnerJoin (MapJoin)","args":[{"op_name":"TableFullScan","table":"partsupp"},{"op_name":"InnerJoin (MapJoin)","args":[{"op_name":"TableFullScan","table":"supplier"},{"op_name":"TableFullScan","table":"nation"}]}]},{"op_name":"TableFullScan","table":"part"}]}]}
1+
{"op_name":"InnerJoin (MapJoin)","args":[{"op_name":"InnerJoin (MapJoin)","args":[{"op_name":"TableFullScan","table":"partsupp"},{"op_name":"InnerJoin (MapJoin)","args":[{"op_name":"TableFullScan","table":"supplier"},{"op_name":"TableFullScan","table":"nation"}]}]},{"op_name":"InnerJoin (MapJoin)","args":[{"op_name":"InnerJoin (MapJoin)","args":[{"op_name":"TableFullScan","table":"partsupp"},{"op_name":"InnerJoin (MapJoin)","args":[{"op_name":"TableFullScan","table":"supplier"},{"op_name":"TableFullScan","table":"nation"}]}]},{"op_name":"TableFullScan","table":"part"}]}]}

ydb/library/yql/core/cbo/cbo_optimizer_new.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ TOptimizerStatistics TBaseProviderContext::ComputeJoinStats(
223223
} else {
224224
std::optional<double> lhsUniqueVals;
225225
std::optional<double> rhsUniqueVals;
226-
if (leftStats.ColumnStatistics && rightStats.ColumnStatistics) {
226+
if (leftStats.ColumnStatistics && rightStats.ColumnStatistics && !leftJoinKeys.empty() && !rightJoinKeys.empty()) {
227227
auto lhs = leftJoinKeys[0];
228228
lhsUniqueVals = leftStats.ColumnStatistics->Data[lhs].NumUniqueVals;
229229
auto rhs = rightJoinKeys[0];
@@ -232,7 +232,6 @@ TOptimizerStatistics TBaseProviderContext::ComputeJoinStats(
232232
}
233233

234234
if (lhsUniqueVals.has_value() && rhsUniqueVals.has_value()) {
235-
selectivity = std::max(*lhsUniqueVals, *rhsUniqueVals);
236235
newCard = leftStats.Nrows * rightStats.Nrows / std::max(*lhsUniqueVals, *rhsUniqueVals);
237236
} else {
238237
newCard = 0.2 * leftStats.Nrows * rightStats.Nrows;

ydb/library/yql/dq/opt/dq_opt_predicate_selectivity.cpp

Lines changed: 80 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ namespace {
3333
} else if (input.Ptr()->IsCallable("FromPg")) {
3434
auto child = TExprBase(input.Ptr()->ChildRef(0));
3535
return IsAttribute(child, attributeName);
36+
} else if (auto exists = input.Maybe<TCoExists>()) {
37+
auto child = TExprBase(input.Ptr()->ChildRef(0));
38+
return IsAttribute(child, attributeName);
3639
}
3740

3841
return false;
@@ -182,30 +185,47 @@ namespace {
182185
}
183186
}
184187

188+
template<typename T>
189+
TExprNode::TPtr FindNode(const TExprBase& input) {
190+
for (const auto& child : input.Ptr()->Children()) {
191+
if (TExprBase(child).Maybe<T>()) {
192+
return child;
193+
}
194+
195+
auto tmp = FindNode<T>(TExprBase(child));
196+
if (tmp != nullptr) {
197+
return tmp;
198+
}
199+
}
200+
201+
return nullptr;
202+
}
203+
185204
/**
186205
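 * Compute the selectivity of a predicate given statistics about the input it operates on.
 * The returned value is capped at 1.0; a predicate of an unrecognised shape is logged and treated as selectivity 1.0.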
187206
*/
188207
double NYql::NDq::ComputePredicateSelectivity(const TExprBase& input, const std::shared_ptr<TOptimizerStatistics>& stats) {
189-
double result = 1.0;
208+
std::optional<double> resSelectivity;
190209

191210
// Process OptionalIf, just return the predicate statistics
192211
if (auto optIf = input.Maybe<TCoOptionalIf>()) {
193-
result = ComputePredicateSelectivity(optIf.Cast().Predicate(), stats);
212+
resSelectivity = ComputePredicateSelectivity(optIf.Cast().Predicate(), stats);
194213
}
195214

196215
// Same with Coalesce
197216
else if (auto coalesce = input.Maybe<TCoCoalesce>()) {
198-
result = ComputePredicateSelectivity(coalesce.Cast().Predicate(), stats);
217+
resSelectivity = ComputePredicateSelectivity(coalesce.Cast().Predicate(), stats);
199218
}
200219

201-
else if (input.Ptr()->IsCallable("FromPg")) {
220+
else if (
221+
input.Ptr()->IsCallable("FromPg") ||
222+
input.Ptr()->IsCallable("Exists") ||
223+
input.Ptr()->IsCallable("AssumeStrict") ||
224+
input.Ptr()->IsCallable("Apply") ||
225+
input.Ptr()->IsCallable("Udf")
226+
) {
202227
auto child = TExprBase(input.Ptr()->ChildRef(0));
203-
result = ComputePredicateSelectivity(child, stats);
204-
}
205-
206-
else if (input.Ptr()->IsCallable("Exists")) {
207-
auto child = TExprBase(input.Ptr()->ChildRef(0));
208-
result = ComputePredicateSelectivity(child, stats);
228+
resSelectivity = ComputePredicateSelectivity(child, stats);
209229
}
210230

211231
else if(input.Ptr()->IsCallable("Find") || input.Ptr()->IsCallable("StringContains")) {
@@ -214,7 +234,7 @@ double NYql::NDq::ComputePredicateSelectivity(const TExprBase& input, const std:
214234

215235
TString attributeName;
216236
if (IsAttribute(member, attributeName) && IsConstantExpr(stringPred.Ptr())) {
217-
result = 0.1;
237+
resSelectivity = 0.1;
218238
}
219239
}
220240

@@ -224,35 +244,35 @@ double NYql::NDq::ComputePredicateSelectivity(const TExprBase& input, const std:
224244
// In case of NOT we subtract the argument's selectivity from 1.0
225245

226246
else if (auto andNode = input.Maybe<TCoAnd>()) {
227-
double res = 1.0;
247+
double tmpSelectivity = 1.0;
228248
for (size_t i = 0; i < andNode.Cast().ArgCount(); i++) {
229-
res *= ComputePredicateSelectivity(andNode.Cast().Arg(i), stats);
249+
tmpSelectivity *= ComputePredicateSelectivity(andNode.Cast().Arg(i), stats);
230250
}
231-
result = res;
251+
resSelectivity = tmpSelectivity;
232252
} else if (auto orNode = input.Maybe<TCoOr>()) {
233-
double res = 0.0;
253+
double tmpSelectivity = 0.0;
234254
for (size_t i = 0; i < orNode.Cast().ArgCount(); i++) {
235-
res += ComputePredicateSelectivity(orNode.Cast().Arg(i), stats);
255+
tmpSelectivity += ComputePredicateSelectivity(orNode.Cast().Arg(i), stats);
236256
}
237-
result = std::max(res, 1.0);
257+
resSelectivity = tmpSelectivity;
238258
} else if (auto notNode = input.Maybe<TCoNot>()) {
239259
double argSel = ComputePredicateSelectivity(notNode.Cast().Value(), stats);
240-
result = 1.0 - (argSel == 1.0 ? 0.95 : argSel);
260+
resSelectivity = 1.0 - (argSel == 1.0 ? 0.95 : argSel);
241261
}
242262

243263
// Process the equality predicate
244264
else if (auto equality = input.Maybe<TCoCmpEqual>()) {
245265
auto left = equality.Cast().Left();
246266
auto right = equality.Cast().Right();
247267

248-
result = ComputeEqualitySelectivity(left, right, stats);
268+
resSelectivity = ComputeEqualitySelectivity(left, right, stats);
249269
}
250270

251271
else if (input.Ptr()->IsCallable("PgResolvedOp") && input.Ptr()->ChildPtr(0)->Content()=="=") {
252272
auto left = TExprBase(input.Ptr()->ChildPtr(2));
253273
auto right = TExprBase(input.Ptr()->ChildPtr(3));
254274

255-
result = ComputeEqualitySelectivity(left, right, stats);
275+
resSelectivity = ComputeEqualitySelectivity(left, right, stats);
256276
}
257277

258278
// Process the not equal predicate
@@ -261,55 +281,78 @@ double NYql::NDq::ComputePredicateSelectivity(const TExprBase& input, const std:
261281
auto right = equality.Cast().Right();
262282

263283
double eqSel = ComputeEqualitySelectivity(left, right, stats);
264-
result = 1.0 - (eqSel == 1.0 ? 0.95 : eqSel);
284+
resSelectivity = 1.0 - (eqSel == 1.0 ? 0.95 : eqSel);
265285
}
266286

267287
else if (input.Ptr()->IsCallable("PgResolvedOp") && input.Ptr()->ChildPtr(0)->Content()=="<>") {
268288
auto left = TExprBase(input.Ptr()->ChildPtr(2));
269289
auto right = TExprBase(input.Ptr()->ChildPtr(3));
270290

271291
double eqSel = ComputeEqualitySelectivity(left, right, stats);
272-
result = 1.0 - (eqSel == 1.0 ? 0.95 : eqSel);
292+
resSelectivity = 1.0 - (eqSel == 1.0 ? 0.95 : eqSel);
273293
}
274294

275295
// Process all other comparison predicates
276296
else if (auto comparison = input.Maybe<TCoCompare>()) {
277297
auto left = comparison.Cast().Left();
278298
auto right = comparison.Cast().Right();
279299

280-
result = ComputeComparisonSelectivity(left, right, stats);
300+
resSelectivity = ComputeComparisonSelectivity(left, right, stats);
281301
}
282302

283303
else if (input.Ptr()->IsCallable("PgResolvedOp") && PgInequalityPreds.contains(input.Ptr()->ChildPtr(0)->Content())){
284304
auto left = TExprBase(input.Ptr()->ChildPtr(2));
285305
auto right = TExprBase(input.Ptr()->ChildPtr(3));
286306

287-
result = ComputeComparisonSelectivity(left, right, stats);
307+
resSelectivity = ComputeComparisonSelectivity(left, right, stats);
288308
}
289309

290310
// Process SqlIn
291311
else if(input.Ptr()->IsCallable("SqlIn")) {
292-
auto left = TExprBase(input.Ptr()->ChildPtr(0));
293-
auto right = TExprBase(input.Ptr()->ChildPtr(1));
312+
auto list = input.Ptr()->ChildPtr(0);
294313

295-
TString attributeName;
314+
double tmpSelectivity = 0.0;
315+
auto lhs = TExprBase(input.Ptr()->ChildPtr(1));
316+
for (const auto& child: list->Children()) {
317+
TExprBase rhs = TExprBase(child);
318+
tmpSelectivity += ComputeEqualitySelectivity(lhs, rhs, stats);
319+
}
320+
resSelectivity = tmpSelectivity;
321+
}
296322

297-
if (IsAttribute(right, attributeName) && IsConstantExpr(left.Ptr())) {
298-
std::swap(left, right);
323+
else if (input.Maybe<TCoAtom>()) {
324+
auto atom = input.Cast<TCoAtom>();
325+
// regexp
326+
if (atom.StringValue().StartsWith("Re2")) {
327+
resSelectivity = 0.5;
299328
}
329+
}
300330

301-
if (IsAttribute(left, attributeName) && IsConstantExpr(right.Ptr())) {
302-
if (right.Ptr()->IsCallable("AsList")) {
303-
auto size = right.Ptr()->Child(0)->ChildrenSize();
304-
if (stats->KeyColumns && stats->KeyColumns->Data.size()==1 && attributeName==stats->KeyColumns->Data[0]) {
305-
result = size / stats->Nrows;
306-
} else {
307-
result = 0.1 + 0.2 / (1 + std::exp(size));
331+
else if (auto maybeIfExpr = input.Maybe<TCoIf>()) {
332+
auto ifExpr = maybeIfExpr.Cast();
333+
334+
// attr in ('a', 'b', 'c' ...)
335+
if (ifExpr.Predicate().Maybe<TCoExists>() && ifExpr.ThenValue().Maybe<TCoJust>() && ifExpr.ElseValue().Maybe<TCoNothing>()) {
336+
auto list = FindNode<TExprList>(ifExpr.ThenValue());
337+
338+
if (list != nullptr) {
339+
double tmpSelectivity = 0.0;
340+
TExprBase lhs = ifExpr.Predicate();
341+
for (const auto& child: list->Children()) {
342+
TExprBase rhs = TExprBase(child);
343+
tmpSelectivity += ComputeEqualitySelectivity(lhs, rhs, stats);
308344
}
309345

346+
resSelectivity = tmpSelectivity;
310347
}
311348
}
312349
}
313350

314-
return result;
351+
if (!resSelectivity.has_value()) {
352+
auto dumped = input.Raw()->Dump();
353+
YQL_CLOG(WARN, CoreDq) << "ComputePredicateSelectivity NOT FOUND : " << dumped;
354+
return 1.0;
355+
}
356+
357+
return std::min(1.0, resSelectivity.value());
315358
}

0 commit comments

Comments
 (0)