Skip to content

Commit b6e9c7d

Browse files
authored
Create vector index in SchemeShard (#4967)
1 parent a58932d commit b6e9c7d

35 files changed

+1211
-245
lines changed

ydb/core/base/table_index.cpp

Lines changed: 37 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -13,20 +13,24 @@ TVector<TString>::const_iterator IsUniq(const TVector<TString>& names) {
1313
return names.end();
1414
}
1515

16+
bool Contains(const TVector<TString>& names, TString str) {
17+
return std::find(names.begin(), names.end(), str) != names.end();
18+
}
19+
1620
namespace NKikimr {
1721
namespace NTableIndex {
1822

19-
TTableColumns CalcTableImplDescription(const TTableColumns& table, const TIndexColumns& index) {
20-
{
21-
TString explain;
22-
Y_ABORT_UNLESS(IsCompatibleIndex(table, index, explain), "explain is %s", explain.c_str());
23-
}
24-
23+
TTableColumns CalcTableImplDescription(const NKikimrSchemeOp::EIndexType indexType, const TTableColumns& table, const TIndexColumns& index) {
2524
TTableColumns result;
2625

27-
for (const auto& ik: index.KeyColumns) {
28-
result.Keys.push_back(ik);
29-
result.Columns.insert(ik);
26+
if (indexType == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree) {
27+
result.Keys.push_back(NTableVectorKmeansTreeIndex::PostingTable_ParentIdColumn);
28+
result.Columns.insert(NTableVectorKmeansTreeIndex::PostingTable_ParentIdColumn);
29+
} else {
30+
for (const auto& ik: index.KeyColumns) {
31+
result.Keys.push_back(ik);
32+
result.Columns.insert(ik);
33+
}
3034
}
3135

3236
for (const auto& tk: table.Keys) {
@@ -43,7 +47,9 @@ TTableColumns CalcTableImplDescription(const TTableColumns& table, const TIndexC
4347
return result;
4448
}
4549

46-
bool IsCompatibleIndex(const TTableColumns& table, const TIndexColumns& index, TString& explain) {
50+
bool IsCompatibleIndex(const NKikimrSchemeOp::EIndexType indexType, const TTableColumns& table, const TIndexColumns& index, TString& explain) {
51+
const bool isVectorIndex = indexType == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree;
52+
4753
{
4854
auto brokenAt = IsUniq(table.Keys);
4955
if (brokenAt != table.Keys.end()) {
@@ -71,6 +77,23 @@ bool IsCompatibleIndex(const TTableColumns& table, const TIndexColumns& index, T
7177
}
7278
}
7379

80+
if (isVectorIndex) {
81+
if (index.KeyColumns.size() != 1) {
82+
explain = "Only single key column is supported for vector index";
83+
return false;
84+
}
85+
86+
if (Contains(index.KeyColumns, NTableVectorKmeansTreeIndex::PostingTable_ParentIdColumn)) {
87+
explain = TStringBuilder() << "Key column should not have a reserved name: " << NTableVectorKmeansTreeIndex::PostingTable_ParentIdColumn;
88+
return false;
89+
}
90+
91+
if (Contains(index.DataColumns, NTableVectorKmeansTreeIndex::PostingTable_ParentIdColumn)) {
92+
explain = TStringBuilder() << "Data column should not have a reserved name: " << NTableVectorKmeansTreeIndex::PostingTable_ParentIdColumn;
93+
return false;
94+
}
95+
}
96+
7497
THashSet<TString> indexKeys;
7598

7699
for (const auto& tableKeyName: table.Keys) {
@@ -84,7 +107,8 @@ bool IsCompatibleIndex(const TTableColumns& table, const TIndexColumns& index, T
84107
}
85108

86109
for (const auto& indexKeyName: index.KeyColumns) {
87-
indexKeys.insert(indexKeyName);
110+
if (!isVectorIndex)
111+
indexKeys.insert(indexKeyName);
88112
if (!table.Columns.contains(indexKeyName)) {
89113
explain = TStringBuilder()
90114
<< "all index keys should be in table columns"
@@ -93,9 +117,9 @@ bool IsCompatibleIndex(const TTableColumns& table, const TIndexColumns& index, T
93117
}
94118
}
95119

96-
if (index.KeyColumns == table.Keys) {
120+
if (index.KeyColumns == table.Keys && !isVectorIndex) {
97121
explain = TStringBuilder()
98-
<< "table and index keys are the same";
122+
<< "table and index keys are the same";
99123
return false;
100124
}
101125

ydb/core/base/table_index.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
#pragma once
22

3+
#include "table_vector_index.h"
4+
#include <ydb/core/protos/flat_scheme_op.pb.h>
5+
36
#include <util/generic/hash_set.h>
47
#include <util/generic/vector.h>
58
#include <util/generic/string.h>
@@ -18,8 +21,8 @@ struct TIndexColumns {
1821
TVector<TString> DataColumns;
1922
};
2023

21-
bool IsCompatibleIndex(const TTableColumns& table, const TIndexColumns& index, TString& explain);
22-
TTableColumns CalcTableImplDescription(const TTableColumns& table, const TIndexColumns &index);
24+
bool IsCompatibleIndex(const NKikimrSchemeOp::EIndexType indexType, const TTableColumns& table, const TIndexColumns& index, TString& explain);
25+
TTableColumns CalcTableImplDescription(const NKikimrSchemeOp::EIndexType indexType, const TTableColumns& table, const TIndexColumns& index);
2326

2427
}
2528
}

ydb/core/base/table_vector_index.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#pragma once
2+
3+
namespace NKikimr::NTableIndex::NTableVectorKmeansTreeIndex {
4+
5+
// Vector KmeansTree index tables description
6+
7+
// Levels table
8+
inline constexpr const char* LevelTable = "indexImplLevelTable";
9+
inline constexpr const char* LevelTable_ParentIdColumn = "-parent";
10+
inline constexpr const char* LevelTable_IdColumn = "-id";
11+
inline constexpr const char* LevelTable_EmbeddingColumn = "-embedding";
12+
13+
// Posting table
14+
inline constexpr const char* PostingTable = "indexImplPostingTable";
15+
inline constexpr const char* PostingTable_ParentIdColumn = "-parent";
16+
17+
18+
}
19+

ydb/core/kqp/host/kqp_gateway_proxy.cpp

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -572,18 +572,7 @@ class TKqpGatewayProxy : public IKikimrGateway {
572572
for (const auto& index : metadata->Indexes) {
573573
auto indexDesc = schemeTx.MutableCreateIndexedTable()->AddIndexDescription();
574574
indexDesc->SetName(index.Name);
575-
switch (index.Type) {
576-
case NYql::TIndexDescription::EType::GlobalSync:
577-
indexDesc->SetType(NKikimrSchemeOp::EIndexType::EIndexTypeGlobal);
578-
break;
579-
case NYql::TIndexDescription::EType::GlobalAsync:
580-
indexDesc->SetType(NKikimrSchemeOp::EIndexType::EIndexTypeGlobalAsync);
581-
break;
582-
case NYql::TIndexDescription::EType::GlobalSyncUnique:
583-
indexDesc->SetType(NKikimrSchemeOp::EIndexType::EIndexTypeGlobalUnique);
584-
break;
585-
}
586-
575+
indexDesc->SetType(TIndexDescription::ConvertIndexType(index.Type));
587576
indexDesc->SetState(static_cast<::NKikimrSchemeOp::EIndexState>(index.State));
588577
for (const auto& col : index.KeyColumns) {
589578
indexDesc->AddKeyColumnNames(col);

ydb/core/kqp/provider/yql_kikimr_gateway.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -115,9 +115,14 @@ void IKikimrGateway::BuildIndexMetadata(TTableMetadataResult& loadTableMetadataR
115115
for (size_t i = 0; i < indexesCount; i++) {
116116
const auto& index = tableMetadata->Indexes[i];
117117
auto indexTablePath = NKikimr::NKqp::NSchemeHelpers::CreateIndexTablePath(tableName, index.Name);
118-
NKikimr::NTableIndex::TTableColumns indexTableColumns = NKikimr::NTableIndex::CalcTableImplDescription(
119-
tableColumns,
120-
NKikimr::NTableIndex::TIndexColumns{index.KeyColumns, {}});
118+
119+
NKikimr::NTableIndex::TIndexColumns indexColumns{index.KeyColumns, {}};
120+
121+
TString error;
122+
NKikimrSchemeOp::EIndexType indexType = TIndexDescription::ConvertIndexType(index.Type);
123+
YQL_ENSURE(IsCompatibleIndex(indexType, tableColumns, indexColumns, error), "Index is not compatible: " << error);
124+
125+
NKikimr::NTableIndex::TTableColumns indexTableColumns = NKikimr::NTableIndex::CalcTableImplDescription(indexType, tableColumns, indexColumns);
121126

122127
TKikimrTableMetadataPtr indexTableMetadata = new TKikimrTableMetadata(cluster, indexTablePath);
123128
indexTableMetadata->DoesExist = true;

ydb/core/kqp/provider/yql_kikimr_gateway.h

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ struct TIndexDescription {
6464
GlobalSync = 0,
6565
GlobalAsync = 1,
6666
GlobalSyncUnique = 2,
67+
GlobalSyncVectorKMeansTree = 3
6768
};
6869

6970
// Index states here must be in sync with NKikimrSchemeOp::EIndexState protobuf
@@ -99,7 +100,7 @@ struct TIndexDescription {
99100
: Name(index.GetName())
100101
, KeyColumns(index.GetKeyColumnNames().begin(), index.GetKeyColumnNames().end())
101102
, DataColumns(index.GetDataColumnNames().begin(), index.GetDataColumnNames().end())
102-
, Type(ConvertIndexType(index))
103+
, Type(ConvertIndexType(index.GetType()))
103104
, State(static_cast<EIndexState>(index.GetState()))
104105
, SchemaVersion(index.GetSchemaVersion())
105106
, LocalPathId(index.GetLocalPathId())
@@ -117,15 +118,32 @@ struct TIndexDescription {
117118
, PathOwnerId(message->GetPathOwnerId())
118119
{}
119120

120-
static TIndexDescription::EType ConvertIndexType(const NKikimrSchemeOp::TIndexDescription& index) {
121-
auto type = NYql::TIndexDescription::EType::GlobalSync;
122-
if (index.GetType() == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalAsync) {
123-
type = NYql::TIndexDescription::EType::GlobalAsync;
124-
} else if (index.GetType() == NKikimrSchemeOp::EIndexType::EIndexTypeGlobalUnique) {
125-
type = NYql::TIndexDescription::EType::GlobalSyncUnique;
121+
static TIndexDescription::EType ConvertIndexType(const NKikimrSchemeOp::EIndexType indexType) {
122+
switch (indexType) {
123+
case NKikimrSchemeOp::EIndexType::EIndexTypeGlobal:
124+
return TIndexDescription::EType::GlobalSync;
125+
case NKikimrSchemeOp::EIndexType::EIndexTypeGlobalAsync:
126+
return TIndexDescription::EType::GlobalAsync;
127+
case NKikimrSchemeOp::EIndexType::EIndexTypeGlobalUnique:
128+
return TIndexDescription::EType::GlobalSyncUnique;
129+
case NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree:
130+
return TIndexDescription::EType::GlobalSyncVectorKMeansTree;
131+
default:
132+
YQL_ENSURE(false, "Unexpected NKikimrSchemeOp::EIndexType::EIndexTypeInvalid");
126133
}
134+
}
127135

128-
return type;
136+
static NKikimrSchemeOp::EIndexType ConvertIndexType(const TIndexDescription::EType indexType) {
137+
switch (indexType) {
138+
case TIndexDescription::EType::GlobalSync:
139+
return NKikimrSchemeOp::EIndexType::EIndexTypeGlobal;
140+
case TIndexDescription::EType::GlobalAsync:
141+
return NKikimrSchemeOp::EIndexType::EIndexTypeGlobalAsync;
142+
case TIndexDescription::EType::GlobalSyncUnique:
143+
return NKikimrSchemeOp::EIndexType::EIndexTypeGlobalUnique;
144+
case NYql::TIndexDescription::EType::GlobalSyncVectorKMeansTree:
145+
return NKikimrSchemeOp::EIndexType::EIndexTypeGlobalVectorKmeansTree;
146+
}
129147
}
130148

131149
void ToMessage(NKikimrKqp::TIndexDescriptionProto* message) const {
@@ -160,6 +178,8 @@ struct TIndexDescription {
160178
return true;
161179
case EType::GlobalAsync:
162180
return false;
181+
case EType::GlobalSyncVectorKMeansTree:
182+
return true;
163183
}
164184
}
165185
};

ydb/core/kqp/ut/scheme/kqp_scheme_ut.cpp

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2430,6 +2430,21 @@ Y_UNIT_TEST_SUITE(KqpScheme) {
24302430
auto indexDesc = describe.GetTableDescription();
24312431
UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetPartitioningSettings().GetMinPartitionsCount(), minPartitionsCount);
24322432
}
2433+
2434+
constexpr int partitionSizeMb = 555;
2435+
{
2436+
auto result = session.ExecuteSchemeQuery(Sprintf(R"(
2437+
ALTER TABLE `/Root/SecondaryKeys` ALTER INDEX Index SET AUTO_PARTITIONING_PARTITION_SIZE_MB %d;
2438+
)", partitionSizeMb)
2439+
).ExtractValueSync();
2440+
UNIT_ASSERT_C(result.IsSuccess(), result.GetIssues().ToString());
2441+
}
2442+
{
2443+
auto describe = session.DescribeTable("/Root/SecondaryKeys/Index/indexImplTable").GetValueSync();
2444+
UNIT_ASSERT_C(describe.IsSuccess(), describe.GetIssues().ToString());
2445+
auto indexDesc = describe.GetTableDescription();
2446+
UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetPartitioningSettings().GetPartitionSizeMb(), partitionSizeMb);
2447+
}
24332448
}
24342449

24352450
Y_UNIT_TEST(AlterIndexImplTable) {
@@ -2635,6 +2650,76 @@ Y_UNIT_TEST_SUITE(KqpScheme) {
26352650
}
26362651
}
26372652

2653+
Y_UNIT_TEST(CreateTableWithVectorIndexPublicApi) {
2654+
TKikimrRunner kikimr;
2655+
auto db = kikimr.GetTableClient();
2656+
auto session = db.CreateSession().GetValueSync().GetSession();
2657+
{
2658+
auto builder = TTableBuilder()
2659+
.AddNullableColumn("Key", EPrimitiveType::Uint64)
2660+
.AddNullableColumn("Embedding", EPrimitiveType::String)
2661+
.SetPrimaryKeyColumn("Key")
2662+
.AddVectorKMeansTreeSecondaryIndex("vector_idx", {"Embedding"},
2663+
{ NYdb::NTable::TVectorIndexSettings::EDistance::Cosine,
2664+
NYdb::NTable::TVectorIndexSettings::EVectorType::Float,
2665+
1024});
2666+
2667+
auto result = session.CreateTable("/Root/TestTable", builder.Build()).ExtractValueSync();
2668+
UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString());
2669+
}
2670+
{
2671+
auto result = session.DescribeTable("/Root/TestTable").ExtractValueSync();
2672+
UNIT_ASSERT_VALUES_EQUAL(result.GetStatus(), NYdb::EStatus::SUCCESS);
2673+
2674+
UNIT_ASSERT_VALUES_EQUAL(result.GetTableDescription().GetIndexDescriptions().size(), 1);
2675+
auto indexDesc = result.GetTableDescription().GetIndexDescriptions()[0];
2676+
UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetIndexName(), "vector_idx");
2677+
UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetIndexType(), EIndexType::GlobalVectorKMeansTree);
2678+
UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetIndexColumns().size(), 1);
2679+
UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetIndexColumns()[0], "Embedding");
2680+
UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetDataColumns().size(), 0);
2681+
UNIT_ASSERT_VALUES_EQUAL(std::get<NYdb::NTable::TVectorIndexSettings::EDistance>(indexDesc.GetVectorIndexSettings()->Metric), NYdb::NTable::TVectorIndexSettings::EDistance::Cosine);
2682+
UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetVectorIndexSettings()->VectorType, NYdb::NTable::TVectorIndexSettings::EVectorType::Float);
2683+
UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetVectorIndexSettings()->VectorDimension, 1024);
2684+
}
2685+
}
2686+
2687+
Y_UNIT_TEST(CreateTableWithVectorIndexCoveredPublicApi) {
2688+
TKikimrRunner kikimr;
2689+
auto db = kikimr.GetTableClient();
2690+
auto session = db.CreateSession().GetValueSync().GetSession();
2691+
{
2692+
auto builder = TTableBuilder()
2693+
.AddNullableColumn("Key", EPrimitiveType::Uint64)
2694+
.AddNullableColumn("Embedding", EPrimitiveType::String)
2695+
.AddNullableColumn("Covered", EPrimitiveType::String)
2696+
.SetPrimaryKeyColumn("Key")
2697+
.AddVectorKMeansTreeSecondaryIndex("vector_idx", {"Embedding"}, {"Covered"},
2698+
{ NYdb::NTable::TVectorIndexSettings::EDistance::Cosine,
2699+
NYdb::NTable::TVectorIndexSettings::EVectorType::Float,
2700+
1024});
2701+
2702+
auto result = session.CreateTable("/Root/TestTable", builder.Build()).ExtractValueSync();
2703+
UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString());
2704+
}
2705+
{
2706+
auto result = session.DescribeTable("/Root/TestTable").ExtractValueSync();
2707+
UNIT_ASSERT_VALUES_EQUAL(result.GetStatus(), NYdb::EStatus::SUCCESS);
2708+
2709+
UNIT_ASSERT_VALUES_EQUAL(result.GetTableDescription().GetIndexDescriptions().size(), 1);
2710+
auto indexDesc = result.GetTableDescription().GetIndexDescriptions()[0];
2711+
UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetIndexName(), "vector_idx");
2712+
UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetIndexType(), EIndexType::GlobalVectorKMeansTree);
2713+
UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetIndexColumns().size(), 1);
2714+
UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetIndexColumns()[0], "Embedding");
2715+
UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetDataColumns().size(), 1);
2716+
UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetDataColumns()[0], "Covered");
2717+
UNIT_ASSERT_VALUES_EQUAL(std::get<NYdb::NTable::TVectorIndexSettings::EDistance>(indexDesc.GetVectorIndexSettings()->Metric), NYdb::NTable::TVectorIndexSettings::EDistance::Cosine);
2718+
UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetVectorIndexSettings()->VectorType, NYdb::NTable::TVectorIndexSettings::EVectorType::Float);
2719+
UNIT_ASSERT_VALUES_EQUAL(indexDesc.GetVectorIndexSettings()->VectorDimension, 1024);
2720+
}
2721+
}
2722+
26382723
Y_UNIT_TEST(AlterTableWithDecimalColumn) {
26392724
TKikimrRunner kikimr;
26402725
auto db = kikimr.GetTableClient();

ydb/core/protos/flat_scheme_op.proto

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import "ydb/core/protos/follower_group.proto";
1313
import "ydb/core/protos/blob_depot_config.proto";
1414
import "ydb/public/api/protos/ydb_coordination.proto";
1515
import "ydb/public/api/protos/ydb_export.proto";
16+
import "ydb/public/api/protos/ydb_table.proto";
1617
import "ydb/public/api/protos/ydb_value.proto";
1718
import "ydb/library/actors/protos/actors.proto";
1819
import "ydb/library/mkql_proto/protos/minikql.proto";
@@ -973,6 +974,7 @@ enum EIndexType {
973974
EIndexTypeGlobal = 1;
974975
EIndexTypeGlobalAsync = 2;
975976
EIndexTypeGlobalUnique = 3;
977+
EIndexTypeGlobalVectorKmeansTree = 4;
976978
}
977979

978980
enum EIndexState {
@@ -982,6 +984,10 @@ enum EIndexState {
982984
EIndexStateWriteOnly = 3;
983985
}
984986

987+
message TVectorIndexKmeansTreeDescription {
988+
optional Ydb.Table.VectorIndexSettings Settings = 1;
989+
}
990+
985991
message TIndexDescription {
986992
optional string Name = 1;
987993
optional uint64 LocalPathId = 2;
@@ -1000,6 +1006,10 @@ message TIndexDescription {
10001006
// DataSize + IndexSize of indexImplTable
10011007
optional uint64 DataSize = 9;
10021008
repeated TTableDescription IndexImplTableDescriptions = 10;
1009+
1010+
oneof SpecializedIndexDescription {
1011+
TVectorIndexKmeansTreeDescription VectorIndexKmeansTreeDescription = 11;
1012+
}
10031013
}
10041014

10051015
message TIndexCreationConfig {
@@ -1009,6 +1019,9 @@ message TIndexCreationConfig {
10091019
repeated TTableDescription IndexImplTableDescriptions = 4; //description for index impl tables
10101020
optional EIndexState State = 5; //state of index at the creation time
10111021
repeated string DataColumnNames = 6; //columns to be denormalized to read data just from index
1022+
oneof SpecializedIndexDescription {
1023+
TVectorIndexKmeansTreeDescription VectorIndexKmeansTreeDescription = 7;
1024+
}
10121025
}
10131026

10141027
message TIndexAlteringConfig {
@@ -1849,6 +1862,7 @@ enum EPathSubType {
18491862
EPathSubTypeSyncIndexImplTable = 1;
18501863
EPathSubTypeAsyncIndexImplTable = 2;
18511864
EPathSubTypeStreamImpl = 3;
1865+
EPathSubTypeVectorKmeansTreeIndexImplTable = 4;
18521866
}
18531867

18541868
enum EPathState {

ydb/core/tx/datashard/datashard_user_table.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#pragma once
22

33
#include <ydb/core/base/storage_pools.h>
4+
#include <ydb/core/base/table_vector_index.h>
45
#include <ydb/core/scheme/scheme_tabledefs.h>
56
#include <ydb/core/tablet_flat/flat_database.h>
67
#include <ydb/core/tablet_flat/flat_stat_table.h>

0 commit comments

Comments
 (0)