Skip to content

Commit 6f0a4d9

Browse files
split library (ydb-platform#2613)
1 parent bb06266 commit 6f0a4d9

16 files changed

+524
-409
lines changed
Lines changed: 0 additions & 140 deletions
Original file line numberDiff line numberDiff line change
@@ -1,141 +1 @@
11
#include "abstract_scheme.h"
2-
3-
#include <ydb/core/tx/columnshard/engines/index_info.h>
4-
#include <ydb/core/formats/arrow/arrow_helpers.h>
5-
#include <util/string/join.h>
6-
7-
namespace NKikimr::NOlap {
8-
9-
std::shared_ptr<arrow::Field> ISnapshotSchema::GetFieldByIndex(const int index) const {
10-
auto schema = GetSchema();
11-
if (!schema || index < 0 || index >= schema->num_fields()) {
12-
return nullptr;
13-
}
14-
return schema->field(index);
15-
}
16-
std::shared_ptr<arrow::Field> ISnapshotSchema::GetFieldByColumnIdOptional(const ui32 columnId) const {
17-
return GetFieldByIndex(GetFieldIndex(columnId));
18-
}
19-
20-
std::set<ui32> ISnapshotSchema::GetPkColumnsIds() const {
21-
std::set<ui32> result;
22-
for (auto&& field : GetIndexInfo().GetReplaceKey()->fields()) {
23-
result.emplace(GetColumnId(field->name()));
24-
}
25-
return result;
26-
27-
}
28-
29-
std::shared_ptr<arrow::RecordBatch> ISnapshotSchema::NormalizeBatch(const ISnapshotSchema& dataSchema, const std::shared_ptr<arrow::RecordBatch> batch) const {
30-
if (dataSchema.GetSnapshot() == GetSnapshot()) {
31-
return batch;
32-
}
33-
Y_ABORT_UNLESS(dataSchema.GetSnapshot() < GetSnapshot());
34-
const std::shared_ptr<arrow::Schema>& resultArrowSchema = GetSchema();
35-
std::vector<std::shared_ptr<arrow::Array>> newColumns;
36-
newColumns.reserve(resultArrowSchema->num_fields());
37-
38-
for (size_t i = 0; i < resultArrowSchema->fields().size(); ++i) {
39-
auto& resultField = resultArrowSchema->fields()[i];
40-
auto columnId = GetIndexInfo().GetColumnId(resultField->name());
41-
auto oldColumnIndex = dataSchema.GetFieldIndex(columnId);
42-
if (oldColumnIndex >= 0) { // ColumnExists
43-
auto oldColumnInfo = dataSchema.GetFieldByIndex(oldColumnIndex);
44-
Y_ABORT_UNLESS(oldColumnInfo);
45-
auto columnData = batch->GetColumnByName(oldColumnInfo->name());
46-
Y_ABORT_UNLESS(columnData);
47-
newColumns.push_back(columnData);
48-
} else { // AddNullColumn
49-
auto nullColumn = NArrow::MakeEmptyBatch(arrow::schema({resultField}), batch->num_rows());
50-
newColumns.push_back(nullColumn->column(0));
51-
}
52-
}
53-
return arrow::RecordBatch::Make(resultArrowSchema, batch->num_rows(), newColumns);
54-
}
55-
56-
std::shared_ptr<arrow::RecordBatch> ISnapshotSchema::PrepareForInsert(const TString& data, const std::shared_ptr<arrow::Schema>& dataSchema) const {
57-
std::shared_ptr<arrow::Schema> dstSchema = GetIndexInfo().ArrowSchema();
58-
auto batch = NArrow::DeserializeBatch(data, (dataSchema ? dataSchema : dstSchema));
59-
if (!batch) {
60-
AFL_WARN(NKikimrServices::TX_COLUMNSHARD)("error", "DeserializeBatch() failed");
61-
return nullptr;
62-
}
63-
if (batch->num_rows() == 0) {
64-
AFL_WARN(NKikimrServices::TX_COLUMNSHARD)("error", "empty batch");
65-
return nullptr;
66-
}
67-
68-
// Correct schema
69-
if (dataSchema) {
70-
batch = NArrow::ExtractColumns(batch, dstSchema, true);
71-
if (!batch) {
72-
AFL_WARN(NKikimrServices::TX_COLUMNSHARD)("error", "cannot correct schema");
73-
return nullptr;
74-
}
75-
}
76-
77-
if (!batch->schema()->Equals(dstSchema)) {
78-
AFL_WARN(NKikimrServices::TX_COLUMNSHARD)("error", TStringBuilder() << "unexpected schema for insert batch: '" << batch->schema()->ToString() << "'");
79-
return nullptr;
80-
}
81-
82-
const auto& sortingKey = GetIndexInfo().GetPrimaryKey();
83-
Y_ABORT_UNLESS(sortingKey);
84-
85-
// Check PK is NOT NULL
86-
for (auto& field : sortingKey->fields()) {
87-
auto column = batch->GetColumnByName(field->name());
88-
if (!column) {
89-
AFL_WARN(NKikimrServices::TX_COLUMNSHARD)("error", TStringBuilder() << "missing PK column '" << field->name() << "'");
90-
return nullptr;
91-
}
92-
if (NArrow::HasNulls(column)) {
93-
AFL_WARN(NKikimrServices::TX_COLUMNSHARD)("error", TStringBuilder() << "PK column '" << field->name() << "' contains NULLs");
94-
return nullptr;
95-
}
96-
}
97-
98-
auto status = batch->ValidateFull();
99-
if (!status.ok()) {
100-
AFL_WARN(NKikimrServices::TX_COLUMNSHARD)("error", status.ToString());
101-
return nullptr;
102-
}
103-
batch = NArrow::SortBatch(batch, sortingKey, true);
104-
Y_DEBUG_ABORT_UNLESS(NArrow::IsSortedAndUnique(batch, sortingKey));
105-
return batch;
106-
}
107-
108-
// Strict variant of GetColumnIdOptional(): verification fails (reporting the column
// name and the schema's field names) when `columnName` has no id.
ui32 ISnapshotSchema::GetColumnId(const std::string& columnName) const {
    const std::optional<ui32> id = GetColumnIdOptional(columnName);
    AFL_VERIFY(id)("column_name", columnName)("schema", JoinSeq(",", GetSchema()->field_names()));
    return *id;
}
113-
114-
std::shared_ptr<arrow::Field> ISnapshotSchema::GetFieldByColumnIdVerified(const ui32 columnId) const {
115-
auto result = GetFieldByColumnIdOptional(columnId);
116-
AFL_VERIFY(result)("event", "unknown_column")("column_id", columnId)("schema", DebugString());
117-
return result;
118-
}
119-
120-
std::shared_ptr<NKikimr::NOlap::TColumnLoader> ISnapshotSchema::GetColumnLoaderVerified(const ui32 columnId) const {
121-
auto result = GetColumnLoaderOptional(columnId);
122-
AFL_VERIFY(result);
123-
return result;
124-
}
125-
126-
std::shared_ptr<NKikimr::NOlap::TColumnLoader> ISnapshotSchema::GetColumnLoaderVerified(const std::string& columnName) const {
127-
auto result = GetColumnLoaderOptional(columnName);
128-
AFL_VERIFY(result);
129-
return result;
130-
}
131-
132-
std::shared_ptr<NKikimr::NOlap::TColumnLoader> ISnapshotSchema::GetColumnLoaderOptional(const std::string& columnName) const {
133-
const std::optional<ui32> id = GetColumnIdOptional(columnName);
134-
if (id) {
135-
return GetColumnLoaderOptional(*id);
136-
} else {
137-
return nullptr;
138-
}
139-
}
140-
141-
}
Lines changed: 1 addition & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,57 +1,2 @@
11
#pragma once
2-
#include "abstract/saver.h"
3-
#include "abstract/loader.h"
4-
5-
#include <ydb/core/tx/columnshard/common/snapshot.h>
6-
7-
#include <string>
8-
9-
namespace NKikimr::NOlap {
10-
11-
struct TIndexInfo;
12-
class TSaverContext;
13-
14-
class ISnapshotSchema {
15-
protected:
16-
virtual TString DoDebugString() const = 0;
17-
public:
18-
using TPtr = std::shared_ptr<ISnapshotSchema>;
19-
20-
virtual ~ISnapshotSchema() {}
21-
virtual std::shared_ptr<TColumnLoader> GetColumnLoaderOptional(const ui32 columnId) const = 0;
22-
std::shared_ptr<TColumnLoader> GetColumnLoaderVerified(const ui32 columnId) const;
23-
std::shared_ptr<TColumnLoader> GetColumnLoaderOptional(const std::string& columnName) const;
24-
std::shared_ptr<TColumnLoader> GetColumnLoaderVerified(const std::string& columnName) const;
25-
26-
virtual TColumnSaver GetColumnSaver(const ui32 columnId, const TSaverContext& context) const = 0;
27-
TColumnSaver GetColumnSaver(const TString& columnName, const TSaverContext& context) const {
28-
return GetColumnSaver(GetColumnId(columnName), context);
29-
}
30-
TColumnSaver GetColumnSaver(const std::string& columnName, const TSaverContext& context) const {
31-
return GetColumnSaver(TString(columnName.data(), columnName.size()), context);
32-
}
33-
34-
virtual std::optional<ui32> GetColumnIdOptional(const std::string& columnName) const = 0;
35-
virtual int GetFieldIndex(const ui32 columnId) const = 0;
36-
37-
ui32 GetColumnId(const std::string& columnName) const;
38-
std::shared_ptr<arrow::Field> GetFieldByIndex(const int index) const;
39-
std::shared_ptr<arrow::Field> GetFieldByColumnIdOptional(const ui32 columnId) const;
40-
std::shared_ptr<arrow::Field> GetFieldByColumnIdVerified(const ui32 columnId) const;
41-
42-
TString DebugString() const {
43-
return DoDebugString();
44-
}
45-
virtual const std::shared_ptr<arrow::Schema>& GetSchema() const = 0;
46-
virtual const TIndexInfo& GetIndexInfo() const = 0;
47-
virtual const TSnapshot& GetSnapshot() const = 0;
48-
virtual ui64 GetVersion() const = 0;
49-
virtual ui32 GetColumnsCount() const = 0;
50-
51-
std::set<ui32> GetPkColumnsIds() const;
52-
53-
std::shared_ptr<arrow::RecordBatch> NormalizeBatch(const ISnapshotSchema& dataSchema, const std::shared_ptr<arrow::RecordBatch> batch) const;
54-
std::shared_ptr<arrow::RecordBatch> PrepareForInsert(const TString& data, const std::shared_ptr<arrow::Schema>& dataSchema) const;
55-
};
56-
57-
} // namespace NKikimr::NOlap
2+
#include "versions/abstract_scheme.h"
Lines changed: 0 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -1,94 +1 @@
11
#include "filtered_scheme.h"
2-
#include <util/string/join.h>
3-
4-
5-
namespace NKikimr::NOlap {
6-
7-
// Convenience constructor: converts the id list into a set and delegates to the
// set-based constructor (duplicate ids collapse).
TFilteredSnapshotSchema::TFilteredSnapshotSchema(ISnapshotSchema::TPtr originalSnapshot, const std::vector<ui32>& columnIds)
    : TFilteredSnapshotSchema(originalSnapshot, std::set<ui32>(columnIds.begin(), columnIds.end()))
{
}
10-
11-
// Builds a view over `originalSnapshot` restricted to `columnIds`.
// The filtered arrow schema keeps the original field order and contains only the
// fields whose column id is in the set.
TFilteredSnapshotSchema::TFilteredSnapshotSchema(ISnapshotSchema::TPtr originalSnapshot, const std::set<ui32>& columnIds)
    : OriginalSnapshot(originalSnapshot)
    , ColumnIds(columnIds)
{
    std::vector<std::shared_ptr<arrow::Field>> keptFields;
    for (const auto& field : OriginalSnapshot->GetSchema()->fields()) {
        const auto fieldColumnId = OriginalSnapshot->GetIndexInfo().GetColumnId(field->name());
        if (ColumnIds.contains(fieldColumnId)) {
            keptFields.emplace_back(field);
        }
    }
    Schema = std::make_shared<arrow::Schema>(keptFields);
}
24-
25-
// Builds a view over `originalSnapshot` restricted to the named columns.
// Every name is resolved with the original snapshot's GetColumnId() (which verifies
// that the column exists); the filtered arrow schema keeps the original field order.
TFilteredSnapshotSchema::TFilteredSnapshotSchema(ISnapshotSchema::TPtr originalSnapshot, const std::set<std::string>& columnNames)
    : OriginalSnapshot(originalSnapshot)
{
    for (const auto& name : columnNames) {
        ColumnIds.insert(OriginalSnapshot->GetColumnId(name));
    }

    std::vector<std::shared_ptr<arrow::Field>> keptFields;
    for (const auto& field : OriginalSnapshot->GetSchema()->fields()) {
        if (columnNames.contains(field->name())) {
            keptFields.emplace_back(field);
        }
    }
    Schema = std::make_shared<arrow::Schema>(keptFields);
}
39-
40-
// Forwards to the original snapshot; aborts when `columnId` is not part of the filter.
TColumnSaver TFilteredSnapshotSchema::GetColumnSaver(const ui32 columnId, const TSaverContext& context) const {
    Y_ABORT_UNLESS(ColumnIds.contains(columnId));
    const ISnapshotSchema& original = *OriginalSnapshot;
    return original.GetColumnSaver(columnId, context);
}
44-
45-
std::shared_ptr<TColumnLoader> TFilteredSnapshotSchema::GetColumnLoaderOptional(const ui32 columnId) const {
46-
Y_ABORT_UNLESS(ColumnIds.contains(columnId));
47-
return OriginalSnapshot->GetColumnLoaderOptional(columnId);
48-
}
49-
50-
std::optional<ui32> TFilteredSnapshotSchema::GetColumnIdOptional(const std::string& columnName) const {
51-
return OriginalSnapshot->GetColumnIdOptional(columnName);
52-
}
53-
54-
int TFilteredSnapshotSchema::GetFieldIndex(const ui32 columnId) const {
55-
if (!ColumnIds.contains(columnId)) {
56-
return -1;
57-
}
58-
TString columnName = OriginalSnapshot->GetIndexInfo().GetColumnName(columnId, false);
59-
if (!columnName) {
60-
return -1;
61-
}
62-
std::string name(columnName.data(), columnName.size());
63-
return Schema->GetFieldIndex(name);
64-
}
65-
66-
const std::shared_ptr<arrow::Schema>& TFilteredSnapshotSchema::GetSchema() const {
67-
return Schema;
68-
}
69-
70-
// Index info of the underlying (unfiltered) snapshot.
const TIndexInfo& TFilteredSnapshotSchema::GetIndexInfo() const {
    const ISnapshotSchema& original = *OriginalSnapshot;
    return original.GetIndexInfo();
}
73-
74-
// Snapshot of the underlying (unfiltered) schema.
const TSnapshot& TFilteredSnapshotSchema::GetSnapshot() const {
    const ISnapshotSchema& original = *OriginalSnapshot;
    return original.GetSnapshot();
}
77-
78-
// Number of fields in the filtered arrow schema.
ui32 TFilteredSnapshotSchema::GetColumnsCount() const {
    const int fieldCount = Schema->num_fields();
    return fieldCount;
}
81-
82-
// Version reported by the original snapshot's index info.
ui64 TFilteredSnapshotSchema::GetVersion() const {
    const TIndexInfo& indexInfo = OriginalSnapshot->GetIndexInfo();
    return indexInfo.GetVersion();
}
85-
86-
// Renders "(original=<original debug string>;column_ids=[id,id,...];)".
TString TFilteredSnapshotSchema::DoDebugString() const {
    TStringBuilder description;
    description << "(";
    description << "original=" << OriginalSnapshot->DebugString() << ";";
    description << "column_ids=[" << JoinSeq(",", ColumnIds) << "];";
    description << ")";
    return description;
}
93-
94-
}
Lines changed: 1 addition & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,2 @@
11
#pragma once
2-
3-
#include "abstract_scheme.h"
4-
5-
#include <ydb/core/tx/columnshard/engines/index_info.h>
6-
7-
namespace NKikimr::NOlap {
8-
9-
class TFilteredSnapshotSchema: public ISnapshotSchema {
10-
ISnapshotSchema::TPtr OriginalSnapshot;
11-
std::shared_ptr<arrow::Schema> Schema;
12-
std::set<ui32> ColumnIds;
13-
protected:
14-
virtual TString DoDebugString() const override;
15-
public:
16-
TFilteredSnapshotSchema(ISnapshotSchema::TPtr originalSnapshot, const std::vector<ui32>& columnIds);
17-
TFilteredSnapshotSchema(ISnapshotSchema::TPtr originalSnapshot, const std::set<ui32>& columnIds);
18-
TFilteredSnapshotSchema(ISnapshotSchema::TPtr originalSnapshot, const std::set<std::string>& columnNames);
19-
20-
TColumnSaver GetColumnSaver(const ui32 columnId, const TSaverContext& context) const override;
21-
std::shared_ptr<TColumnLoader> GetColumnLoaderOptional(const ui32 columnId) const override;
22-
std::optional<ui32> GetColumnIdOptional(const std::string& columnName) const override;
23-
int GetFieldIndex(const ui32 columnId) const override;
24-
25-
const std::shared_ptr<arrow::Schema>& GetSchema() const override;
26-
const TIndexInfo& GetIndexInfo() const override;
27-
const TSnapshot& GetSnapshot() const override;
28-
ui32 GetColumnsCount() const override;
29-
ui64 GetVersion() const override;
30-
};
31-
32-
}
2+
#include "versions/filtered_scheme.h"
Lines changed: 0 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,53 +1 @@
11
#include "snapshot_scheme.h"
2-
3-
namespace NKikimr::NOlap {
4-
5-
// Takes ownership of `indexInfo` and caches its arrow schema "with specials".
// Schema is initialized from the IndexInfo member, so this assumes IndexInfo is
// declared before Schema in the class — TODO confirm against the header.
TSnapshotSchema::TSnapshotSchema(TIndexInfo&& indexInfo, const TSnapshot& snapshot)
    : IndexInfo(std::move(indexInfo))
    , Schema(IndexInfo.ArrowSchemaWithSpecials())
    , Snapshot(snapshot)
{}
11-
12-
// Saver for the given column, as provided by the index info.
TColumnSaver TSnapshotSchema::GetColumnSaver(const ui32 columnId, const TSaverContext& context) const {
    return this->IndexInfo.GetColumnSaver(columnId, context);
}
15-
16-
std::shared_ptr<TColumnLoader> TSnapshotSchema::GetColumnLoaderOptional(const ui32 columnId) const {
17-
return IndexInfo.GetColumnLoaderOptional(columnId);
18-
}
19-
20-
std::optional<ui32> TSnapshotSchema::GetColumnIdOptional(const std::string& columnName) const {
21-
return IndexInfo.GetColumnIdOptional(columnName);
22-
}
23-
24-
int TSnapshotSchema::GetFieldIndex(const ui32 columnId) const {
25-
const TString& columnName = IndexInfo.GetColumnName(columnId, false);
26-
if (!columnName) {
27-
return -1;
28-
}
29-
std::string name(columnName.data(), columnName.size());
30-
return Schema->GetFieldIndex(name);
31-
}
32-
33-
const std::shared_ptr<arrow::Schema>& TSnapshotSchema::GetSchema() const {
34-
return Schema;
35-
}
36-
37-
// Index info owned by this schema.
const TIndexInfo& TSnapshotSchema::GetIndexInfo() const {
    return this->IndexInfo;
}
40-
41-
// Snapshot this schema was created for.
const TSnapshot& TSnapshotSchema::GetSnapshot() const {
    return this->Snapshot;
}
44-
45-
// Number of fields in the cached arrow schema.
ui32 TSnapshotSchema::GetColumnsCount() const {
    const int fieldCount = Schema->num_fields();
    return fieldCount;
}
48-
49-
// Version reported by the owned index info.
ui64 TSnapshotSchema::GetVersion() const {
    return this->IndexInfo.GetVersion();
}
52-
53-
}

0 commit comments

Comments
 (0)