diff --git a/ydb/core/formats/arrow/common/accessor.cpp b/ydb/core/formats/arrow/accessor/abstract/accessor.cpp similarity index 68% rename from ydb/core/formats/arrow/common/accessor.cpp rename to ydb/core/formats/arrow/accessor/abstract/accessor.cpp index 775cffa95bab..44ed07e7e3f1 100644 --- a/ydb/core/formats/arrow/common/accessor.cpp +++ b/ydb/core/formats/arrow/accessor/abstract/accessor.cpp @@ -5,6 +5,8 @@ #include #include #include +#include +#include namespace NKikimr::NArrow::NAccessor { @@ -72,64 +74,32 @@ const std::partial_ordering IChunkedArray::TAddress::Compare(const TAddress& ite return TComparator::TypedCompare(*Array, Position, *item.Array, item.Position); } -namespace { -class TChunkAccessor { -private: - std::shared_ptr ChunkedArray; -public: - TChunkAccessor(const std::shared_ptr& chunkedArray) - : ChunkedArray(chunkedArray) - { - - } - ui64 GetChunksCount() const { - return (ui64)ChunkedArray->num_chunks(); - } - ui64 GetChunkLength(const ui32 idx) const { - return (ui64)ChunkedArray->chunk(idx)->length(); - } - std::shared_ptr GetArray(const ui32 idx) const { - return ChunkedArray->chunk(idx); - } -}; - -} - -std::optional TTrivialArray::DoGetRawSize() const { - return NArrow::GetArrayDataSize(Array); + TChunkedArraySerialized::TChunkedArraySerialized(const std::shared_ptr& array, const TString& serializedData) + : Array(array) + , SerializedData(serializedData) { + AFL_VERIFY(serializedData); + AFL_VERIFY(Array); + AFL_VERIFY(Array->GetRecordsCount()); } std::partial_ordering IChunkedArray::TCurrentChunkAddress::Compare(const ui64 position, const TCurrentChunkAddress& item, const ui64 itemPosition) const { - AFL_VERIFY(StartPosition <= position); - AFL_VERIFY(position < FinishPosition); - AFL_VERIFY(item.StartPosition <= itemPosition); - AFL_VERIFY(itemPosition < item.FinishPosition); - return TComparator::TypedCompare(*Array, position - StartPosition, *item.Array, itemPosition - item.StartPosition); + AFL_VERIFY(GetStartPosition() <= position)("pos", position)("start", GetStartPosition()); + AFL_VERIFY(position < GetFinishPosition())("pos", position)("finish", GetFinishPosition()); + AFL_VERIFY(item.GetStartPosition() <= itemPosition)("start", item.GetStartPosition())("item", itemPosition); + AFL_VERIFY(itemPosition < item.GetFinishPosition())("item", itemPosition)("finish", item.GetFinishPosition()); + return TComparator::TypedCompare(*Array, position - GetStartPosition(), *item.Array, itemPosition - item.GetStartPosition()); } std::shared_ptr IChunkedArray::TCurrentChunkAddress::CopyRecord(const ui64 recordIndex) const { - AFL_VERIFY(StartPosition <= recordIndex); - AFL_VERIFY(recordIndex < FinishPosition); - return NArrow::CopyRecords(Array, { recordIndex - StartPosition }); + AFL_VERIFY(GetStartPosition() <= recordIndex); + AFL_VERIFY(recordIndex < GetFinishPosition()); + return NArrow::CopyRecords(Array, { recordIndex - GetStartPosition() }); } TString IChunkedArray::TCurrentChunkAddress::DebugString(const ui64 position) const { - AFL_VERIFY(position < FinishPosition); - AFL_VERIFY(StartPosition <= position); - return NArrow::DebugString(Array, position - StartPosition); -} - -IChunkedArray::TCurrentChunkAddress TTrivialChunkedArray::DoGetChunk(const std::optional& chunkCurrent, const ui64 position) const { - TChunkAccessor accessor(Array); - return SelectChunk(chunkCurrent, position, accessor); -} - -std::optional TTrivialChunkedArray::DoGetRawSize() const { - ui64 result = 0; - for (auto&& i : Array->chunks()) { - result += NArrow::GetArrayDataSize(i); - } - return result; + AFL_VERIFY(position < GetFinishPosition()); + AFL_VERIFY(GetStartPosition() <= position); + return NArrow::DebugString(Array, position - GetStartPosition()); } } diff --git a/ydb/core/formats/arrow/common/accessor.h b/ydb/core/formats/arrow/accessor/abstract/accessor.h similarity index 54% rename from ydb/core/formats/arrow/common/accessor.h rename to ydb/core/formats/arrow/accessor/abstract/accessor.h index 6021f47f5a88..79a105bc6178 100644 --- a/ydb/core/formats/arrow/common/accessor.h +++ b/ydb/core/formats/arrow/accessor/abstract/accessor.h @@ -1,58 +1,98 @@ #pragma once #include -#include +#include -#include #include -#include +#include +#include +#include +#include namespace NKikimr::NArrow::NAccessor { +class TColumnSaver; +class IChunkedArray; + +class TChunkedArraySerialized { +private: + YDB_READONLY_DEF(std::shared_ptr, Array); + YDB_READONLY_DEF(TString, SerializedData); + +public: + TChunkedArraySerialized(const std::shared_ptr& array, const TString& serializedData); +}; + class IChunkedArray { public: enum class EType { Undefined, Array, ChunkedArray, - SerializedChunkedArray + SerializedChunkedArray, + SparsedArray }; - class TCurrentChunkAddress { + class TCommonChunkAddress { private: - YDB_READONLY_DEF(std::shared_ptr, Array); YDB_READONLY(ui64, StartPosition, 0); YDB_READONLY(ui64, FinishPosition, 0); YDB_READONLY(ui64, ChunkIndex, 0); + public: - TString DebugString(const ui64 position) const; + TString DebugString() const { + return TStringBuilder() << "start=" << StartPosition << ";" + << "chunk_index=" << ChunkIndex << ";" + << "finish=" << FinishPosition << ";" + << "size=" << FinishPosition - StartPosition << ";"; + } ui64 GetLength() const { - return Array->length(); + return FinishPosition - StartPosition; } bool Contains(const ui64 position) const { return position >= StartPosition && position < FinishPosition; } + TCommonChunkAddress(const ui64 start, const ui64 finish, const ui64 index) + : StartPosition(start) + , FinishPosition(finish) + , ChunkIndex(index) { + AFL_VERIFY(FinishPosition > StartPosition); + } + }; + + class TCurrentArrayAddress: public TCommonChunkAddress { + private: + YDB_READONLY_DEF(std::shared_ptr, Array); + + public: + TCurrentArrayAddress(const std::shared_ptr& arr, const ui32 pos, const ui32 idx) + : TCommonChunkAddress(pos, pos + TValidator::CheckNotNull(arr)->GetRecordsCount(), idx) + , Array(arr) { + AFL_VERIFY(Array); + AFL_VERIFY(Array->GetRecordsCount()); + } + }; + + class TCurrentChunkAddress: public TCommonChunkAddress { + private: + using TBase = TCommonChunkAddress; + YDB_READONLY_DEF(std::shared_ptr, Array); + + public: + using TBase::DebugString; + TString DebugString(const ui64 position) const; + std::shared_ptr CopyRecord(const ui64 recordIndex) const; std::partial_ordering Compare(const ui64 position, const TCurrentChunkAddress& item, const ui64 itemPosition) const; TCurrentChunkAddress(const std::shared_ptr& arr, const ui64 pos, const ui32 chunkIdx) - : Array(arr) - , StartPosition(pos) - , ChunkIndex(chunkIdx) - { - AFL_VERIFY(arr); - AFL_VERIFY(arr->length()); - FinishPosition = StartPosition + arr->length(); - } - - TString DebugString() const { - return TStringBuilder() - << "start=" << StartPosition << ";" - << "chunk_index=" << ChunkIndex << ";" - << "length=" << Array->length() << ";"; + : TCommonChunkAddress(pos, pos + TValidator::CheckNotNull(arr)->length(), chunkIdx) + , Array(arr) { + AFL_VERIFY(Array); + AFL_VERIFY(Array->length()); } }; @@ -61,6 +101,7 @@ class IChunkedArray { YDB_READONLY_DEF(std::shared_ptr, Array); YDB_READONLY(ui64, Position, 0); YDB_READONLY(ui64, ChunkIdx, 0); + public: bool NextPosition() { if (Position + 1 < (ui32)Array->length()) { @@ -73,30 +114,37 @@ class IChunkedArray { TAddress(const std::shared_ptr& arr, const ui64 position, const ui64 chunkIdx) : Array(arr) , Position(position) - , ChunkIdx(chunkIdx) - { - + , ChunkIdx(chunkIdx) { } const std::partial_ordering Compare(const TAddress& item) const; }; + private: YDB_READONLY_DEF(std::shared_ptr, DataType); YDB_READONLY(ui64, RecordsCount, 0); YDB_READONLY(EType, Type, EType::Undefined); virtual std::optional DoGetRawSize() const = 0; + virtual std::shared_ptr DoGetScalar(const ui32 index) const = 0; + protected: virtual std::shared_ptr DoGetChunkedArray() const = 0; + virtual TCurrentArrayAddress DoGetArray( + const std::optional& chunkCurrent, const ui64 position, const std::shared_ptr& selfPtr) const = 0; virtual TCurrentChunkAddress DoGetChunk(const std::optional& chunkCurrent, const ui64 position) const = 0; + virtual std::shared_ptr DoGetMaxScalar() const = 0; + virtual std::vector DoSplitBySizes( + const TColumnSaver& saver, const TString& fullSerializedData, const std::vector& splitSizes) = 0; - template - TCurrentChunkAddress SelectChunk(const std::optional& chunkCurrent, const ui64 position, const TChunkAccessor& accessor) const { - if (!chunkCurrent || position >= chunkCurrent->GetStartPosition()) { + template + void SelectChunk(const std::optional& chunkCurrent, const ui64 position, const TChunkAccessor& accessor) const { + if (!chunkCurrent || chunkCurrent->GetStartPosition() <= position) { ui32 startIndex = 0; ui64 idx = 0; if (chunkCurrent) { if (position < chunkCurrent->GetFinishPosition()) { - return *chunkCurrent; + return accessor.OnArray( + chunkCurrent->GetChunkIndex(), chunkCurrent->GetStartPosition(), position - chunkCurrent->GetStartPosition()); } AFL_VERIFY(chunkCurrent->GetChunkIndex() < accessor.GetChunksCount()); startIndex = chunkCurrent->GetChunkIndex(); @@ -105,7 +153,7 @@ class IChunkedArray { for (ui32 i = startIndex; i < accessor.GetChunksCount(); ++i) { const ui64 nextIdx = idx + accessor.GetChunkLength(i); if (idx <= position && position < nextIdx) { - return TCurrentChunkAddress(accessor.GetArray(i), idx, i); + return accessor.OnArray(i, idx, position - idx); } idx = nextIdx; } @@ -116,7 +164,7 @@ class IChunkedArray { AFL_VERIFY(idx >= accessor.GetChunkLength(i))("idx", idx)("length", accessor.GetChunkLength(i)); const ui64 nextIdx = idx - accessor.GetChunkLength(i); if (nextIdx <= position && position < idx) { - return TCurrentChunkAddress(accessor.GetArray(i), nextIdx, i); + return accessor.OnArray(i, nextIdx, position - nextIdx); } idx = nextIdx; } @@ -131,21 +179,20 @@ class IChunkedArray { if (chunkCurrent) { chunkCurrentInfo << chunkCurrent->DebugString(); } - AFL_VERIFY(recordsCountChunks == GetRecordsCount())("pos", position)("count", GetRecordsCount())("chunks_map", sb)("chunk_current", chunkCurrentInfo); + AFL_VERIFY(recordsCountChunks == GetRecordsCount())("pos", position)("count", GetRecordsCount())("chunks_map", sb)( + "chunk_current", chunkCurrentInfo); AFL_VERIFY(false)("pos", position)("count", GetRecordsCount())("chunks_map", sb)("chunk_current", chunkCurrentInfo); - return TCurrentChunkAddress(nullptr, 0, 0); } public: - class TReader { private: std::shared_ptr ChunkedArray; mutable std::optional CurrentChunkAddress; + public: TReader(const std::shared_ptr& data) - : ChunkedArray(data) - { + : ChunkedArray(data) { AFL_VERIFY(ChunkedArray); } @@ -154,16 +201,38 @@ class IChunkedArray { } TAddress GetReadChunk(const ui64 position) const; - static std::partial_ordering CompareColumns(const std::vector& l, const ui64 lPosition, const std::vector& r, const ui64 rPosition); + static std::partial_ordering CompareColumns( + const std::vector& l, const ui64 lPosition, const std::vector& r, const ui64 rPosition); void AppendPositionTo(arrow::ArrayBuilder& builder, const ui64 position, ui64* recordSize) const; std::shared_ptr CopyRecord(const ui64 recordIndex) const; TString DebugString(const ui32 position) const; }; + std::shared_ptr GetScalar(const ui32 index) const { + AFL_VERIFY(index < GetRecordsCount()); + return DoGetScalar(index); + } + + std::vector SplitBySizes( + const TColumnSaver& saver, const TString& fullSerializedData, const std::vector& splitSizes) { + return DoSplitBySizes(saver, fullSerializedData, splitSizes); + } + + std::shared_ptr GetMaxScalar() const { + AFL_VERIFY(GetRecordsCount()); + return DoGetMaxScalar(); + } + std::optional GetRawSize() const { return DoGetRawSize(); } + ui64 GetRawSizeVerified() const { + auto result = GetRawSize(); + AFL_VERIFY(result); + return *result; + } + std::shared_ptr GetChunkedArray() const { return DoGetChunkedArray(); } @@ -171,7 +240,14 @@ class IChunkedArray { std::shared_ptr Slice(const ui32 offset, const ui32 count) const; + TCurrentArrayAddress GetArray( + const std::optional& chunkCurrent, const ui64 position, const std::shared_ptr& selfPtr) const { + AFL_VERIFY(position < GetRecordsCount()); + return DoGetArray(chunkCurrent, position, selfPtr); + } + TCurrentChunkAddress GetChunk(const std::optional& chunkCurrent, const ui64 position) const { + AFL_VERIFY(position < GetRecordsCount()); return DoGetChunk(chunkCurrent, position); } @@ -179,49 +255,7 @@ class IChunkedArray { : DataType(dataType) , RecordsCount(recordsCount) , Type(type) { - - } -}; - -class TTrivialArray: public IChunkedArray { -private: - using TBase = IChunkedArray; - const std::shared_ptr Array; -protected: - virtual std::optional DoGetRawSize() const override; - - virtual TCurrentChunkAddress DoGetChunk(const std::optional& /*chunkCurrent*/, const ui64 /*position*/) const override { - return TCurrentChunkAddress(Array, 0, 0); - } - virtual std::shared_ptr DoGetChunkedArray() const override { - return std::make_shared(Array); - } - -public: - TTrivialArray(const std::shared_ptr& data) - : TBase(data->length(), EType::Array, data->type()) - , Array(data) { - - } -}; - -class TTrivialChunkedArray: public IChunkedArray { -private: - using TBase = IChunkedArray; - const std::shared_ptr Array; -protected: - virtual TCurrentChunkAddress DoGetChunk(const std::optional& chunkCurrent, const ui64 position) const override; - virtual std::shared_ptr DoGetChunkedArray() const override { - return Array; - } - virtual std::optional DoGetRawSize() const override; - -public: - TTrivialChunkedArray(const std::shared_ptr& data) - : TBase(data->length(), EType::ChunkedArray, data->type()) - , Array(data) { - } }; -} +} // namespace NKikimr::NArrow::NAccessor diff --git a/ydb/core/formats/arrow/accessor/abstract/constructor.cpp b/ydb/core/formats/arrow/accessor/abstract/constructor.cpp new file mode 100644 index 000000000000..51c2e86cadf7 --- /dev/null +++ b/ydb/core/formats/arrow/accessor/abstract/constructor.cpp @@ -0,0 +1,11 @@ +#include "constructor.h" +#include + +namespace NKikimr::NArrow::NAccessor { + +TConstructorContainer TConstructorContainer::GetDefaultConstructor() { + static std::shared_ptr result = std::make_shared(); + return result; +} + +} diff --git a/ydb/core/formats/arrow/accessor/abstract/constructor.h b/ydb/core/formats/arrow/accessor/abstract/constructor.h new file mode 100644 index 000000000000..7f9883402c25 --- /dev/null +++ b/ydb/core/formats/arrow/accessor/abstract/constructor.h @@ -0,0 +1,76 @@ +#pragma once +#include "accessor.h" + +#include +#include + +#include + +#include + +namespace NKikimr::NArrow::NAccessor { + +class IConstructor { +public: + using TFactory = NObjectFactory::TObjectFactory; + using TProto = NKikimrArrowAccessorProto::TConstructor; + +private: + virtual TConclusion> DoConstruct( + const std::shared_ptr& originalData, const TChunkConstructionData& externalInfo) const = 0; + virtual TConclusion> DoConstructDefault( + const TChunkConstructionData& externalInfo) const = 0; + virtual NKikimrArrowAccessorProto::TConstructor DoSerializeToProto() const = 0; + virtual bool DoDeserializeFromProto(const NKikimrArrowAccessorProto::TConstructor& proto) = 0; + virtual std::shared_ptr DoGetExpectedSchema(const std::shared_ptr& resultColumn) const = 0; + virtual TString DoDebugString() const { + return ""; + } + +public: + virtual ~IConstructor() = default; + + TString DebugString() const { + return TStringBuilder() << GetClassName() << ":" << DoDebugString(); + } + + TConclusion> Construct( + const std::shared_ptr& originalData, const TChunkConstructionData& externalInfo) const { + return DoConstruct(originalData, externalInfo); + } + + TConclusion> ConstructDefault(const TChunkConstructionData& externalInfo) const { + return DoConstructDefault(externalInfo); + } + + bool DeserializeFromProto(const NKikimrArrowAccessorProto::TConstructor& proto) { + return DoDeserializeFromProto(proto); + } + + NKikimrArrowAccessorProto::TConstructor SerializeToProto() const { + return DoSerializeToProto(); + } + + void SerializeToProto(NKikimrArrowAccessorProto::TConstructor& proto) const { + proto = DoSerializeToProto(); + } + + std::shared_ptr GetExpectedSchema(const std::shared_ptr& resultColumn) const { + AFL_VERIFY(resultColumn); + return DoGetExpectedSchema(resultColumn); + } + + virtual TString GetClassName() const = 0; +}; + +class TConstructorContainer: public NBackgroundTasks::TInterfaceProtoContainer { +private: + using TBase = NBackgroundTasks::TInterfaceProtoContainer; + +public: + using TBase::TBase; + + static TConstructorContainer GetDefaultConstructor(); +}; + +} // namespace NKikimr::NArrow::NAccessor diff --git a/ydb/core/formats/arrow/accessor/abstract/request.cpp b/ydb/core/formats/arrow/accessor/abstract/request.cpp new file mode 100644 index 000000000000..08bc3ee5c87d --- /dev/null +++ b/ydb/core/formats/arrow/accessor/abstract/request.cpp @@ -0,0 +1,16 @@ +#include "request.h" + +namespace NKikimr::NArrow::NAccessor { + +TConclusionStatus TRequestedConstructorContainer::DeserializeFromRequest(NYql::TFeaturesExtractor& features) { + const std::optional className = features.Extract("DATA_ACCESSOR_CONSTRUCTOR.CLASS_NAME"); + if (!className) { + return TConclusionStatus::Success(); + } + if (!TBase::Initialize(*className)) { + return TConclusionStatus::Fail("don't know anything about class_name=" + *className); + } + return TBase::GetObjectPtr()->DeserializeFromRequest(features); +} + +} diff --git a/ydb/core/formats/arrow/accessor/abstract/request.h b/ydb/core/formats/arrow/accessor/abstract/request.h new file mode 100644 index 000000000000..c13105fe8e21 --- /dev/null +++ b/ydb/core/formats/arrow/accessor/abstract/request.h @@ -0,0 +1,58 @@ +#pragma once +#include "constructor.h" + +#include + +#include +#include + +#include + +namespace NKikimr::NArrow::NAccessor { + +class IRequestedConstructor { +public: + using TFactory = NObjectFactory::TObjectFactory; + using TProto = NKikimrArrowAccessorProto::TRequestedConstructor; +private: + virtual TConclusion DoBuildConstructor() const = 0; + virtual NKikimrArrowAccessorProto::TRequestedConstructor DoSerializeToProto() const = 0; + virtual bool DoDeserializeFromProto(const NKikimrArrowAccessorProto::TRequestedConstructor& proto) = 0; + virtual TConclusionStatus DoDeserializeFromRequest(NYql::TFeaturesExtractor& features) = 0; + +public: + virtual ~IRequestedConstructor() = default; + + NKikimrArrowAccessorProto::TRequestedConstructor SerializeToProto() const { + return DoSerializeToProto(); + } + + void SerializeToProto(NKikimrArrowAccessorProto::TRequestedConstructor& proto) const { + proto = DoSerializeToProto(); + } + + bool DeserializeFromProto(const NKikimrArrowAccessorProto::TRequestedConstructor& proto) { + return DoDeserializeFromProto(proto); + } + + TConclusionStatus DeserializeFromRequest(NYql::TFeaturesExtractor& features) { + return DoDeserializeFromRequest(features); + } + + TConclusion BuildConstructor() const { + return DoBuildConstructor(); + } + + virtual TString GetClassName() const = 0; +}; + +class TRequestedConstructorContainer: public NBackgroundTasks::TInterfaceProtoContainer { +private: + using TBase = NBackgroundTasks::TInterfaceProtoContainer; + +public: + using TBase::TBase; + TConclusionStatus DeserializeFromRequest(NYql::TFeaturesExtractor& features); +}; + +} // namespace NKikimr::NArrow::NAccessor diff --git a/ydb/core/formats/arrow/accessor/abstract/ya.make b/ydb/core/formats/arrow/accessor/abstract/ya.make new file mode 100644 index 000000000000..fd68f1eeb3bf --- /dev/null +++ b/ydb/core/formats/arrow/accessor/abstract/ya.make @@ -0,0 +1,17 @@ +LIBRARY() + +PEERDIR( + ydb/core/formats/arrow/protos + ydb/core/formats/arrow/accessor/common + contrib/libs/apache/arrow + ydb/library/conclusion + ydb/services/metadata/abstract +) + +SRCS( + accessor.cpp + constructor.cpp + request.cpp +) + +END() diff --git a/ydb/core/formats/arrow/accessor/common/chunk_data.cpp b/ydb/core/formats/arrow/accessor/common/chunk_data.cpp new file mode 100644 index 000000000000..da03037ef321 --- /dev/null +++ b/ydb/core/formats/arrow/accessor/common/chunk_data.cpp @@ -0,0 +1,5 @@ +#include "chunk_data.h" + +namespace NKikimr::NArrow::NAccessor { + +} diff --git a/ydb/core/formats/arrow/accessor/common/chunk_data.h b/ydb/core/formats/arrow/accessor/common/chunk_data.h new file mode 100644 index 000000000000..d10d27abb85b --- /dev/null +++ b/ydb/core/formats/arrow/accessor/common/chunk_data.h @@ -0,0 +1,23 @@ +#pragma once +#include +#include +#include + +namespace NKikimr::NArrow::NAccessor { + +class TChunkConstructionData { +private: + YDB_READONLY(ui32, RecordsCount, 0); + YDB_READONLY_DEF(std::shared_ptr, DefaultValue); + YDB_READONLY_DEF(std::shared_ptr, ColumnType); + +public: + TChunkConstructionData( + const ui32 recordsCount, const std::shared_ptr& defaultValue, const std::shared_ptr& columnType) + : RecordsCount(recordsCount) + , DefaultValue(defaultValue) + , ColumnType(columnType) { + } +}; + +} // namespace NKikimr::NArrow::NAccessor diff --git a/ydb/core/formats/arrow/accessor/common/const.cpp b/ydb/core/formats/arrow/accessor/common/const.cpp new file mode 100644 index 000000000000..926a9ca94deb --- /dev/null +++ b/ydb/core/formats/arrow/accessor/common/const.cpp @@ -0,0 +1,5 @@ +#include "const.h" + +namespace NKikimr::NArrow::NAccessor { + +} diff --git a/ydb/core/formats/arrow/accessor/common/const.h b/ydb/core/formats/arrow/accessor/common/const.h new file mode 100644 index 000000000000..192332854478 --- /dev/null +++ b/ydb/core/formats/arrow/accessor/common/const.h @@ -0,0 +1,12 @@ +#pragma once +#include + +namespace NKikimr::NArrow::NAccessor { + +class TGlobalConst { +public: + static const inline TString SparsedDataAccessorName = "SPARSED"; + static const inline TString PlainDataAccessorName = "PLAIN"; +}; + +} // namespace NKikimr::NArrow::NAccessor diff --git a/ydb/core/formats/arrow/accessor/common/ya.make b/ydb/core/formats/arrow/accessor/common/ya.make new file mode 100644 index 000000000000..28ef714226aa --- /dev/null +++ b/ydb/core/formats/arrow/accessor/common/ya.make @@ -0,0 +1,12 @@ +LIBRARY() + +PEERDIR( + contrib/libs/apache/arrow +) + +SRCS( + chunk_data.cpp + const.cpp +) + +END() diff --git a/ydb/core/formats/arrow/accessor/composite/accessor.cpp b/ydb/core/formats/arrow/accessor/composite/accessor.cpp new file mode 100644 index 000000000000..f4da58429753 --- /dev/null +++ b/ydb/core/formats/arrow/accessor/composite/accessor.cpp @@ -0,0 +1,71 @@ +#include "accessor.h" +namespace NKikimr::NArrow::NAccessor { + +namespace { +class TCompositeChunkAccessor { +private: + const std::vector>& Chunks; + std::optional* ResultChunkAddress = nullptr; + std::optional* ResultArrayAddress = nullptr; + +public: + TCompositeChunkAccessor( + const std::vector>& chunks, std::optional& result) + : Chunks(chunks) + , ResultChunkAddress(&result) { + } + TCompositeChunkAccessor( + const std::vector>& chunks, std::optional& result) + : Chunks(chunks) + , ResultArrayAddress(&result) { + } + ui64 GetChunksCount() const { + return Chunks.size(); + } + ui64 GetChunkLength(const ui32 idx) const { + return Chunks[idx]->GetRecordsCount(); + } + void OnArray(const ui32 chunkIdx, const ui32 startPosition, const ui32 internalPosition) const { + if (ResultChunkAddress) { + auto internalAddress = Chunks[chunkIdx]->GetChunk({}, internalPosition); + *ResultChunkAddress = NArrow::NAccessor::IChunkedArray::TCurrentChunkAddress( + internalAddress.GetArray(), startPosition + internalAddress.GetStartPosition(), chunkIdx); + } + if (ResultArrayAddress) { + *ResultArrayAddress = NArrow::NAccessor::IChunkedArray::TCurrentArrayAddress(Chunks[chunkIdx], startPosition, chunkIdx); + } + } +}; +} // namespace + +NArrow::NAccessor::IChunkedArray::TCurrentChunkAddress TCompositeChunkedArray::DoGetChunk( + const std::optional& chunkCurrent, const ui64 position) const { + std::optional result; + TCompositeChunkAccessor accessor(Chunks, result); + SelectChunk(chunkCurrent, position, accessor); + AFL_VERIFY(result); + return *result; +} + +NArrow::NAccessor::IChunkedArray::TCurrentArrayAddress TCompositeChunkedArray::DoGetArray( + const std::optional& chunkCurrent, const ui64 position, const std::shared_ptr& /*selfPtr*/) const { + std::optional result; + TCompositeChunkAccessor accessor(Chunks, result); + SelectChunk(chunkCurrent, position, accessor); + AFL_VERIFY(result); + return *result; +} + +std::shared_ptr TCompositeChunkedArray::DoGetChunkedArray() const { + std::vector> chunks; + for (auto&& i : Chunks) { + auto arr = i->GetChunkedArray(); + AFL_VERIFY(arr->num_chunks()); + for (auto&& chunk : arr->chunks()) { + chunks.emplace_back(chunk); + } + } + return std::make_shared(chunks); +} + +} // namespace NKikimr::NArrow::NAccessor diff --git a/ydb/core/formats/arrow/accessor/composite/accessor.h b/ydb/core/formats/arrow/accessor/composite/accessor.h new file mode 100644 index 000000000000..bea1b8d258d1 --- /dev/null +++ b/ydb/core/formats/arrow/accessor/composite/accessor.h @@ -0,0 +1,69 @@ +#pragma once +#include +#include + +namespace NKikimr::NArrow::NAccessor { + +class TCompositeChunkedArray: public NArrow::NAccessor::IChunkedArray { +private: + using TBase = NArrow::NAccessor::IChunkedArray; + +private: + std::vector> Chunks; + +protected: + virtual TCurrentArrayAddress DoGetArray(const std::optional& chunkCurrent, const ui64 /*position*/, + const std::shared_ptr& selfPtr) const override; + + virtual std::vector DoSplitBySizes( + const TColumnSaver& /*saver*/, const TString& /*fullSerializedData*/, const std::vector& /*splitSizes*/) override { + AFL_VERIFY(false); + return {}; + } + + virtual std::shared_ptr DoGetScalar(const ui32 index) const override { + AFL_VERIFY(false)("problem", "cannot use method"); + return nullptr; + } + virtual std::optional DoGetRawSize() const override { + return {}; + } + virtual std::shared_ptr DoGetMaxScalar() const override { + AFL_VERIFY(false); + return nullptr; + } + virtual TCurrentChunkAddress DoGetChunk(const std::optional& chunkCurrent, const ui64 position) const override; + virtual std::shared_ptr DoGetChunkedArray() const override; + + TCompositeChunkedArray(std::vector>&& chunks, const ui32 recordsCount, + const std::shared_ptr& type) + : TBase(recordsCount, NArrow::NAccessor::IChunkedArray::EType::SerializedChunkedArray, type) + , Chunks(std::move(chunks)) { + } + +public: + class TBuilder { + private: + ui32 RecordsCount = 0; + std::vector> Chunks; + const std::shared_ptr Type; + + public: + TBuilder(const std::shared_ptr& type) + : Type(type) { + AFL_VERIFY(Type); + } + + void AddChunk(const std::shared_ptr& arr) { + AFL_VERIFY(arr->GetDataType()->id() == Type->id())("incoming", arr->GetDataType()->ToString())("main", Type->ToString()); + Chunks.emplace_back(arr); + RecordsCount += arr->GetRecordsCount(); + } + + std::shared_ptr Finish() { + return std::shared_ptr(new TCompositeChunkedArray(std::move(Chunks), RecordsCount, Type)); + } + }; +}; + +} // namespace NKikimr::NArrow::NAccessor diff --git a/ydb/core/formats/arrow/accessor/composite/ya.make b/ydb/core/formats/arrow/accessor/composite/ya.make new file mode 100644 index 000000000000..828c9a8e531d --- /dev/null +++ b/ydb/core/formats/arrow/accessor/composite/ya.make @@ -0,0 +1,12 @@ +LIBRARY() + +PEERDIR( + contrib/libs/apache/arrow + ydb/core/formats/arrow/common +) + +SRCS( + accessor.cpp +) + +END() diff --git a/ydb/core/formats/arrow/accessor/composite_serial/accessor.cpp b/ydb/core/formats/arrow/accessor/composite_serial/accessor.cpp new file mode 100644 index 000000000000..966c46bfe56d --- /dev/null +++ b/ydb/core/formats/arrow/accessor/composite_serial/accessor.cpp @@ -0,0 +1,60 @@ +#include "accessor.h" + +namespace NKikimr::NArrow::NAccessor { + +namespace { +class TSerializedChunkAccessor { +private: + const std::vector& Chunks; + const std::shared_ptr& Loader; + TDeserializeChunkedArray::TChunkCacheInfo* CachedDataOwner; + std::optional* ResultAddress; + +public: + TSerializedChunkAccessor(const std::vector& chunks, const std::shared_ptr& loader, + TDeserializeChunkedArray::TChunkCacheInfo* cachedDataOwner, + std::optional* resultAddress) + : Chunks(chunks) + , Loader(loader) + , CachedDataOwner(cachedDataOwner) + , ResultAddress(resultAddress) { + } + ui64 GetChunksCount() const { + return Chunks.size(); + } + ui64 GetChunkLength(const ui32 idx) const { + return Chunks[idx].GetRecordsCount(); + } + void OnArray(const ui32 chunkIdx, const ui32 startPosition, const ui32 internalPosition) const { + if (!CachedDataOwner->GetChunk() || CachedDataOwner->GetIndex() != chunkIdx) { + CachedDataOwner->SetChunk(Chunks[chunkIdx].GetArrayVerified(Loader)); + CachedDataOwner->SetIndex(chunkIdx); + CachedDataOwner->SetStartPosition(startPosition); + } + if (ResultAddress) { + auto addressInternal = CachedDataOwner->GetChunk()->GetChunk({}, internalPosition); + *ResultAddress = NArrow::NAccessor::IChunkedArray::TCurrentChunkAddress( + addressInternal.GetArray(), startPosition + addressInternal.GetStartPosition(), chunkIdx); + } + } +}; +} // namespace + +NArrow::NAccessor::IChunkedArray::TCurrentChunkAddress TDeserializeChunkedArray::DoGetChunk( + const std::optional& chunkCurrent, const ui64 position) const { + std::optional result; + TSerializedChunkAccessor accessor(Chunks, Loader, CurrentChunkCache.get(), &result); + SelectChunk(chunkCurrent, position, accessor); + AFL_VERIFY(result); + return *result; +} + +NArrow::NAccessor::IChunkedArray::TCurrentArrayAddress TDeserializeChunkedArray::DoGetArray( + const std::optional& chunkCurrent, const ui64 position, const std::shared_ptr& /*selfPtr*/) const { + TSerializedChunkAccessor accessor(Chunks, Loader, CurrentChunkCache.get(), nullptr); + SelectChunk(chunkCurrent, position, accessor); + return IChunkedArray::TCurrentArrayAddress( + CurrentChunkCache->GetChunk(), CurrentChunkCache->GetStartPosition(), CurrentChunkCache->GetIndex()); +} + +} // namespace NKikimr::NArrow::NAccessor diff --git a/ydb/core/formats/arrow/accessor/composite_serial/accessor.h b/ydb/core/formats/arrow/accessor/composite_serial/accessor.h new file mode 100644 index 000000000000..d7c0e22150ba --- /dev/null +++ b/ydb/core/formats/arrow/accessor/composite_serial/accessor.h @@ -0,0 +1,88 @@ +#pragma once +#include +#include + +namespace NKikimr::NArrow::NAccessor { + +class TDeserializeChunkedArray: public NArrow::NAccessor::IChunkedArray { +private: + using TBase = NArrow::NAccessor::IChunkedArray; + +public: + class TChunkCacheInfo { + private: + YDB_ACCESSOR_DEF(std::shared_ptr, Chunk); + YDB_ACCESSOR(ui32, Index, 0); + YDB_ACCESSOR(ui32, StartPosition, 0); + + public: + }; + + class TChunk { + private: + YDB_READONLY(ui32, RecordsCount, 0); + std::shared_ptr PredefinedArray; + const TString Data; + + public: + TChunk(const std::shared_ptr& predefinedArray) + : PredefinedArray(predefinedArray) { + AFL_VERIFY(PredefinedArray); + RecordsCount = PredefinedArray->GetRecordsCount(); + } + + TChunk(const ui32 recordsCount, const TString& data) + : RecordsCount(recordsCount) + , Data(data) { + } + + std::shared_ptr GetArrayVerified(const std::shared_ptr& loader) const { + if (PredefinedArray) { + return PredefinedArray; + } + return loader->ApplyVerified(Data, RecordsCount); + } + }; + +private: + std::shared_ptr Loader; + std::vector Chunks; + std::shared_ptr CurrentChunkCache = std::make_shared(); + +protected: + virtual TCurrentArrayAddress DoGetArray(const std::optional& chunkCurrent, const ui64 /*position*/, + const std::shared_ptr& selfPtr) const override; + + virtual std::vector DoSplitBySizes( + const TColumnSaver& /*saver*/, const TString& /*fullSerializedData*/, const std::vector& /*splitSizes*/) override { + AFL_VERIFY(false); + return {}; + } + + virtual std::shared_ptr DoGetScalar(const ui32 index) const override { + AFL_VERIFY(false)("problem", "cannot use method"); + return nullptr; + } + virtual std::optional DoGetRawSize() const override { + return {}; + } + virtual std::shared_ptr DoGetMaxScalar() const override { + AFL_VERIFY(false); + return nullptr; + } + virtual TCurrentChunkAddress DoGetChunk(const std::optional& chunkCurrent, const ui64 position) const override; + virtual std::shared_ptr DoGetChunkedArray() const override { + AFL_VERIFY(false); + return nullptr; + } + +public: + TDeserializeChunkedArray(const ui64 recordsCount, const std::shared_ptr& loader, std::vector&& chunks) + : TBase(recordsCount, NArrow::NAccessor::IChunkedArray::EType::SerializedChunkedArray, loader->GetField()->type()) + , Loader(loader) + , Chunks(std::move(chunks)) { + AFL_VERIFY(Loader); + } +}; + +} // namespace NKikimr::NArrow::NAccessor diff --git a/ydb/core/formats/arrow/accessor/composite_serial/ya.make b/ydb/core/formats/arrow/accessor/composite_serial/ya.make new file mode 100644 index 000000000000..49c2e1e41ea4 --- /dev/null +++ b/ydb/core/formats/arrow/accessor/composite_serial/ya.make @@ -0,0 +1,13 @@ +LIBRARY() + +PEERDIR( + contrib/libs/apache/arrow + ydb/core/formats/arrow/common + ydb/core/formats/arrow/save_load +) + +SRCS( + accessor.cpp +) + +END() diff --git a/ydb/core/formats/arrow/accessor/plain/accessor.cpp b/ydb/core/formats/arrow/accessor/plain/accessor.cpp new file mode 100644 index 000000000000..ec0547a9bcd9 --- /dev/null +++ b/ydb/core/formats/arrow/accessor/plain/accessor.cpp @@ -0,0 +1,88 @@ +#include "accessor.h" + +#include +#include +#include + +namespace NKikimr::NArrow::NAccessor { + +std::optional TTrivialArray::DoGetRawSize() const { + return NArrow::GetArrayDataSize(Array); +} + +std::vector TTrivialArray::DoSplitBySizes( + const TColumnSaver& saver, const TString& fullSerializedData, const std::vector& splitSizes) { + auto schema = std::make_shared(arrow::FieldVector({ std::make_shared("f", GetDataType()) })); + auto chunks = NArrow::NSplitter::TSimpleSplitter(saver).SplitBySizes( + arrow::RecordBatch::Make(schema, GetRecordsCount(), { Array }), fullSerializedData, splitSizes); + std::vector result; + for (auto&& i : chunks) { + AFL_VERIFY(i.GetSlicedBatch()->num_columns() == 1); + result.emplace_back(std::make_shared(i.GetSlicedBatch()->column(0)), i.GetSerializedChunk()); + } + return result; +} + +std::shared_ptr TTrivialArray::DoGetMaxScalar() const { + auto minMaxPos = NArrow::FindMinMaxPosition(Array); + return NArrow::TStatusValidator::GetValid(Array->GetScalar(minMaxPos.second)); +} + +namespace { +class TChunkAccessor { +private: + std::shared_ptr ChunkedArray; + std::optional* Result; + +public: + TChunkAccessor(const std::shared_ptr& chunkedArray, std::optional& result) + : ChunkedArray(chunkedArray) + , Result(&result) { + } + ui64 GetChunksCount() const { + return (ui64)ChunkedArray->num_chunks(); + } + ui64 GetChunkLength(const ui32 idx) const { + return (ui64)ChunkedArray->chunk(idx)->length(); + } + void OnArray(const ui32 idx, const ui32 startPosition, const ui32 /*internalPosition*/) const { + *Result = IChunkedArray::TCurrentChunkAddress(ChunkedArray->chunk(idx), startPosition, idx); + } +}; + +} // namespace + +IChunkedArray::TCurrentChunkAddress TTrivialChunkedArray::DoGetChunk( + const std::optional& chunkCurrent, const ui64 position) const { + std::optional result; + TChunkAccessor accessor(Array, result); + SelectChunk(chunkCurrent, position, accessor); + AFL_VERIFY(result); + return *result; +} + +std::optional TTrivialChunkedArray::DoGetRawSize() const { + ui64 result = 0; + for (auto&& i : Array->chunks()) { + result += NArrow::GetArrayDataSize(i); + } + return result; +} + +std::shared_ptr TTrivialChunkedArray::DoGetMaxScalar() const { + std::shared_ptr result; + for (auto&& i : Array->chunks()) { + if (!i->length()) { + continue; + } + auto minMaxPos = NArrow::FindMinMaxPosition(i); + auto scalarCurrent = NArrow::TStatusValidator::GetValid(i->GetScalar(minMaxPos.second)); + if (!result || ScalarCompare(result, scalarCurrent) < 0) { + result = scalarCurrent; + } + } + + return result; +} + +} // namespace NKikimr::NArrow::NAccessor diff --git a/ydb/core/formats/arrow/accessor/plain/accessor.h b/ydb/core/formats/arrow/accessor/plain/accessor.h new file mode 100644 index 000000000000..349f3b7d5ee6 --- /dev/null +++ b/ydb/core/formats/arrow/accessor/plain/accessor.h @@ -0,0 +1,76 @@ +#pragma once +#include +#include + +namespace NKikimr::NArrow::NAccessor { + +class TTrivialArray: public IChunkedArray { +private: + using TBase = IChunkedArray; + const std::shared_ptr Array; + +protected: + virtual std::optional DoGetRawSize() const override; + + virtual TCurrentChunkAddress DoGetChunk( + const std::optional& /*chunkCurrent*/, const ui64 /*position*/) const override { + return TCurrentChunkAddress(Array, 0, 0); + } + virtual std::shared_ptr DoGetChunkedArray() const override { + return std::make_shared(Array); + } + virtual std::shared_ptr DoGetScalar(const ui32 index) const override { + return NArrow::TStatusValidator::GetValid(Array->GetScalar(index)); + } + virtual std::shared_ptr DoGetMaxScalar() const override; + virtual std::vector DoSplitBySizes( + const TColumnSaver& saver, const TString& fullSerializedData, const std::vector& splitSizes) override; + + virtual TCurrentArrayAddress DoGetArray(const std::optional& /*chunkCurrent*/, const ui64 /*position*/, + const std::shared_ptr& selfPtr) const override { + return TCurrentArrayAddress(selfPtr, 0, 0); + } + +public: + TTrivialArray(const std::shared_ptr& data) + : TBase(data->length(), EType::Array, data->type()) + , Array(data) { + } +}; + +class TTrivialChunkedArray: public IChunkedArray { +private: + using TBase = IChunkedArray; + const std::shared_ptr Array; + +protected: + virtual TCurrentChunkAddress DoGetChunk(const std::optional& chunkCurrent, const ui64 position) const override; + virtual std::shared_ptr DoGetChunkedArray() const override { + return Array; + } + virtual std::optional DoGetRawSize() const override; + virtual std::shared_ptr DoGetScalar(const ui32 index) const override { + auto chunk = GetChunk({}, index); + return NArrow::TStatusValidator::GetValid(chunk.GetArray()->GetScalar(index - chunk.GetStartPosition())); + } + virtual std::vector DoSplitBySizes( + const TColumnSaver& /*saver*/, const TString& /*fullSerializedData*/, const std::vector& /*splitSizes*/) override { + AFL_VERIFY(false); + return {}; + } + + virtual std::shared_ptr DoGetMaxScalar() const override; + + virtual TCurrentArrayAddress DoGetArray(const std::optional& /*chunkCurrent*/, const ui64 /*position*/, + const std::shared_ptr& selfPtr) const override { + return TCurrentArrayAddress(selfPtr, 0, 0); + } + +public: + TTrivialChunkedArray(const std::shared_ptr& data) + : TBase(data->length(), EType::ChunkedArray, data->type()) + , Array(data) { + } +}; + +} // namespace NKikimr::NArrow::NAccessor diff --git a/ydb/core/formats/arrow/accessor/plain/constructor.cpp b/ydb/core/formats/arrow/accessor/plain/constructor.cpp new file mode 100644 index 000000000000..7e756d1f30bf --- /dev/null +++ b/ydb/core/formats/arrow/accessor/plain/constructor.cpp @@ -0,0 +1,34 @@ +#include "accessor.h" +#include "constructor.h" + +#include +#include + +#include + +namespace NKikimr::NArrow::NAccessor::NPlain { + +TConclusion> TConstructor::DoConstruct( + const std::shared_ptr& originalData, const TChunkConstructionData& /*externalInfo*/) const { + AFL_VERIFY(originalData->num_columns() == 1)("count", originalData->num_columns())("schema", originalData->schema()->ToString()); + return std::make_shared(originalData->column(0)); +} + +TConclusion> TConstructor::DoConstructDefault(const TChunkConstructionData& externalInfo) const { + return std::make_shared( + NArrow::TThreadSimpleArraysCache::Get(externalInfo.GetColumnType(), externalInfo.GetDefaultValue(), externalInfo.GetRecordsCount())); +} + +NKikimrArrowAccessorProto::TConstructor TConstructor::DoSerializeToProto() const { + return NKikimrArrowAccessorProto::TConstructor(); +} + +bool TConstructor::DoDeserializeFromProto(const NKikimrArrowAccessorProto::TConstructor& /*proto*/) { + return true; +} + +std::shared_ptr TConstructor::DoGetExpectedSchema(const std::shared_ptr& resultColumn) const { + return std::make_shared(arrow::FieldVector({ resultColumn })); +} + +} // namespace NKikimr::NArrow::NAccessor::NPlain diff --git a/ydb/core/formats/arrow/accessor/plain/constructor.h b/ydb/core/formats/arrow/accessor/plain/constructor.h new file mode 100644 index 000000000000..cf84f5021bd7 --- /dev/null +++ b/ydb/core/formats/arrow/accessor/plain/constructor.h @@ -0,0 +1,28 @@ +#pragma once +#include +#include + +namespace NKikimr::NArrow::NAccessor::NPlain { + +class TConstructor: public IConstructor { +public: + static TString GetClassNameStatic() { + return TGlobalConst::PlainDataAccessorName; + } + +private: + static inline auto Registrator = TFactory::TRegistrator(GetClassNameStatic()); + virtual TConclusion> DoConstruct( + const std::shared_ptr& originalData, const TChunkConstructionData& externalInfo) const override; + virtual NKikimrArrowAccessorProto::TConstructor DoSerializeToProto() const override; + virtual bool DoDeserializeFromProto(const NKikimrArrowAccessorProto::TConstructor& proto) override; + virtual std::shared_ptr DoGetExpectedSchema(const std::shared_ptr& resultColumn) const override; + virtual TConclusion> DoConstructDefault(const TChunkConstructionData& externalInfo) const override; + +public: + virtual TString GetClassName() const override { + return GetClassNameStatic(); + } +}; + +} // namespace NKikimr::NArrow::NAccessor::NPlain diff --git a/ydb/core/formats/arrow/accessor/plain/request.cpp b/ydb/core/formats/arrow/accessor/plain/request.cpp new file mode 100644 index 000000000000..05a6ab128165 --- /dev/null +++ b/ydb/core/formats/arrow/accessor/plain/request.cpp @@ -0,0 +1,22 @@ +#include "request.h" +#include "constructor.h" + +namespace NKikimr::NArrow::NAccessor::NPlain { + +NKikimrArrowAccessorProto::TRequestedConstructor TRequestedConstuctor::DoSerializeToProto() const { + return NKikimrArrowAccessorProto::TRequestedConstructor(); +} + +bool TRequestedConstuctor::DoDeserializeFromProto(const NKikimrArrowAccessorProto::TRequestedConstructor& /*proto*/) { + return true; +} + +NKikimr::TConclusionStatus TRequestedConstuctor::DoDeserializeFromRequest(NYql::TFeaturesExtractor& /*features*/) { + return TConclusionStatus::Success(); +} + +NKikimr::TConclusion TRequestedConstuctor::DoBuildConstructor() const { + return std::make_shared(); +} + +} diff --git a/ydb/core/formats/arrow/accessor/plain/request.h b/ydb/core/formats/arrow/accessor/plain/request.h new file mode 100644 index 000000000000..02f6cce8560a --- /dev/null +++ b/ydb/core/formats/arrow/accessor/plain/request.h @@ -0,0 +1,26 @@ +#pragma once +#include +#include + +namespace NKikimr::NArrow::NAccessor::NPlain { + +class TRequestedConstuctor: public IRequestedConstructor { +public: + static TString GetClassNameStatic() { + return TGlobalConst::PlainDataAccessorName; + } + +private: + static inline auto Registrator = TFactory::TRegistrator(GetClassNameStatic()); + virtual TConclusion DoBuildConstructor() const override; + virtual NKikimrArrowAccessorProto::TRequestedConstructor DoSerializeToProto() const override; + virtual bool DoDeserializeFromProto(const NKikimrArrowAccessorProto::TRequestedConstructor& /*proto*/) override; + virtual TConclusionStatus DoDeserializeFromRequest(NYql::TFeaturesExtractor& /*features*/) override; + +public: + virtual TString GetClassName() const override { + return GetClassNameStatic(); + } +}; + +} // namespace NKikimr::NArrow::NAccessor::NPlain diff --git a/ydb/core/formats/arrow/accessor/plain/ya.make b/ydb/core/formats/arrow/accessor/plain/ya.make new file mode 100644 index 000000000000..b7d600885be5 --- /dev/null +++ b/ydb/core/formats/arrow/accessor/plain/ya.make @@ -0,0 +1,14 @@ +LIBRARY() + +PEERDIR( + ydb/core/formats/arrow/protos + ydb/core/formats/arrow/accessor/abstract +) + +SRCS( + accessor.cpp + GLOBAL constructor.cpp + GLOBAL request.cpp +) + +END() diff --git a/ydb/core/formats/arrow/accessor/sparsed/accessor.cpp b/ydb/core/formats/arrow/accessor/sparsed/accessor.cpp new file mode 100644 index 000000000000..e0c967eab077 --- /dev/null +++ b/ydb/core/formats/arrow/accessor/sparsed/accessor.cpp @@ -0,0 +1,263 @@ +#include "accessor.h" + +#include +#include +#include +#include + +namespace NKikimr::NArrow::NAccessor { + +TSparsedArray::TSparsedArray(const IChunkedArray& defaultArray, const std::shared_ptr& defaultValue) + : TBase(defaultArray.GetRecordsCount(), EType::SparsedArray, defaultArray.GetDataType()) + , DefaultValue(defaultValue) { + if (DefaultValue) { + AFL_VERIFY(DefaultValue->type->id() == defaultArray.GetDataType()->id()); + } + std::optional current; + std::shared_ptr records; + ui32 sparsedRecordsCount = 0; + AFL_VERIFY(SwitchType(GetDataType()->id(), [&](const auto& type) { + using TWrap = std::decay_t; + using TScalar = typename arrow::TypeTraits::ScalarType; + using TArray = typename arrow::TypeTraits::ArrayType; + using TBuilder = typename arrow::TypeTraits::BuilderType; + auto builderValue = NArrow::MakeBuilder(GetDataType()); + TBuilder* builderValueImpl = (TBuilder*)builderValue.get(); + auto builderIndex = NArrow::MakeBuilder(arrow::uint32()); + arrow::UInt32Builder* builderIndexImpl = (arrow::UInt32Builder*)builderIndex.get(); + auto scalar = static_pointer_cast(DefaultValue); + for (ui32 pos = 0; pos < GetRecordsCount();) { + current = defaultArray.GetChunk(current, pos); + auto typedArray = static_pointer_cast(current->GetArray()); + for (ui32 i = 0; i < typedArray->length(); ++i) { + std::optional isDefault; + if (scalar) { + if constexpr (arrow::has_string_view()) { + isDefault = arrow::util::string_view((char*)scalar->value->data(), scalar->value->size()) == typedArray->GetView(i); + } else if constexpr (arrow::has_c_type()) { + isDefault = scalar->value == typedArray->Value(i); + } else { + AFL_VERIFY(false)("type", GetDataType()->ToString()); + } + } else { + isDefault = typedArray->IsNull(i); + } + if (!*isDefault) { + if constexpr (arrow::has_string_view()) { + NArrow::TStatusValidator::Validate(builderValueImpl->Append(typedArray->GetView(i))); + NArrow::TStatusValidator::Validate(builderIndexImpl->Append(pos + i)); + ++sparsedRecordsCount; + } else if constexpr (arrow::has_c_type()) { + NArrow::TStatusValidator::Validate(builderValueImpl->Append(typedArray->Value(i))); + NArrow::TStatusValidator::Validate(builderIndexImpl->Append(pos + i)); + ++sparsedRecordsCount; + } else { + AFL_VERIFY(false)("type", GetDataType()->ToString()); + } + } + } + pos = current->GetFinishPosition(); + AFL_VERIFY(pos <= GetRecordsCount()); + } + std::vector> fields = { std::make_shared("index", arrow::uint32()), + std::make_shared("value", GetDataType()) }; + auto schema = std::make_shared(fields); + std::vector> columns = { NArrow::TStatusValidator::GetValid(builderIndex->Finish()), + NArrow::TStatusValidator::GetValid(builderValue->Finish()) }; + records = arrow::RecordBatch::Make(schema, sparsedRecordsCount, columns); + AFL_VERIFY_DEBUG(records->ValidateFull().ok()); + return true; + })); + AFL_VERIFY(records); + Records.emplace_back(TSparsedArrayChunk(0, GetRecordsCount(), records, DefaultValue)); +} + +std::vector TSparsedArray::DoSplitBySizes( + const TColumnSaver& saver, const TString& fullSerializedData, const std::vector& splitSizes) { + AFL_VERIFY(Records.size() == 1)("size", Records.size()); + auto chunks = NArrow::NSplitter::TSimpleSplitter(saver).SplitBySizes(Records.front().GetRecords(), fullSerializedData, splitSizes); + + std::vector result; + ui32 idx = 0; + ui32 startIdx = 0; + for (auto&& i : chunks) { + AFL_VERIFY(i.GetSlicedBatch()->num_columns() == 2); + AFL_VERIFY(i.GetSlicedBatch()->column(0)->type()->id() == arrow::uint32()->id()); + auto UI32Column = static_pointer_cast(i.GetSlicedBatch()->column(0)); + ui32 nextStartIdx = NArrow::NAccessor::TSparsedArray::GetLastIndex(i.GetSlicedBatch()) + 1; + if (idx + 1 == chunks.size()) { + nextStartIdx = GetRecordsCount(); + } + std::shared_ptr batch; + { + std::unique_ptr builder = NArrow::MakeBuilder(arrow::uint32()); + arrow::UInt32Builder* builderImpl = (arrow::UInt32Builder*)builder.get(); + for (ui32 rowIdx = 0; rowIdx < UI32Column->length(); ++rowIdx) { + TStatusValidator::Validate(builderImpl->Append(UI32Column->Value(rowIdx) - startIdx)); + } + auto colIndex = TStatusValidator::GetValid(builder->Finish()); + batch = arrow::RecordBatch::Make( + i.GetSlicedBatch()->schema(), i.GetSlicedBatch()->num_rows(), { colIndex, i.GetSlicedBatch()->column(1) }); + } + + ++idx; + { + TBuilder builder(DefaultValue, GetDataType()); + builder.AddChunk(nextStartIdx - startIdx, batch); + result.emplace_back(builder.Finish(), saver.Apply(batch)); + } + startIdx = nextStartIdx; + } + + return result; +} + +std::shared_ptr TSparsedArray::DoGetMaxScalar() const { + std::shared_ptr result; + for (auto&& i : Records) { + auto scalarCurrent = i.GetMaxScalar(); + if (!scalarCurrent) { + continue; + } + if (!result || ScalarCompare(result, scalarCurrent) < 0) { + result = scalarCurrent; + } + } + return result; +} + +ui32 TSparsedArray::GetLastIndex(const std::shared_ptr& batch) { + AFL_VERIFY(batch); + AFL_VERIFY(batch->num_rows()); + auto c = batch->GetColumnByName("index"); + AFL_VERIFY(c)("schema", batch->schema()->ToString()); + AFL_VERIFY(c->type_id() == arrow::uint32()->id())("type", c->type()->ToString()); + auto ui32Column = static_pointer_cast(c); + return ui32Column->Value(ui32Column->length() - 1); +} + +IChunkedArray::TCurrentChunkAddress TSparsedArrayChunk::GetChunk( + const std::optional& /*chunkCurrent*/, const ui64 position, const ui32 chunkIdx) const { + auto it = RemapExternalToInternal.upper_bound(position); + AFL_VERIFY(it != RemapExternalToInternal.begin()); + --it; + if (it->second.GetIsDefault()) { + return IChunkedArray::TCurrentChunkAddress( + NArrow::TThreadSimpleArraysCache::Get(ColValue->type(), DefaultValue, it->second.GetSize()), StartPosition + it->first, chunkIdx); + } else { + return IChunkedArray::TCurrentChunkAddress( + ColValue->Slice(it->second.GetStart(), it->second.GetSize()), StartPosition + it->first, chunkIdx); + } +} + +std::vector> TSparsedArrayChunk::GetChunkedArray() const { + std::vector> chunks; + for (auto&& i : RemapExternalToInternal) { + if (i.second.GetIsDefault()) { + chunks.emplace_back(NArrow::TThreadSimpleArraysCache::Get(ColValue->type(), DefaultValue, i.second.GetSize())); + } else { + chunks.emplace_back(ColValue->Slice(i.second.GetStart(), i.second.GetSize())); + } + } + return chunks; +} + +TSparsedArrayChunk::TSparsedArrayChunk(const ui32 posStart, const ui32 recordsCount, const std::shared_ptr& records, + const std::shared_ptr& defaultValue) + : RecordsCount(recordsCount) + , StartPosition(posStart) + , Records(records) + , DefaultValue(defaultValue) { + AFL_VERIFY(records->num_columns() == 2); + ColIndex = Records->GetColumnByName("index"); + AFL_VERIFY(ColIndex); + AFL_VERIFY(ColIndex->type_id() == arrow::uint32()->id()); + UI32ColIndex = static_pointer_cast(ColIndex); + if (UI32ColIndex->length()) { + AFL_VERIFY(UI32ColIndex->Value(UI32ColIndex->length() - 1) < recordsCount)("val", UI32ColIndex->Value(UI32ColIndex->length() - 1))( + "count", recordsCount); + } + NotDefaultRecordsCount = UI32ColIndex->length(); + RawValues = UI32ColIndex->raw_values(); + ColValue = Records->GetColumnByName("value"); + if (DefaultValue) { + AFL_VERIFY(DefaultValue->type->id() == ColValue->type_id()); + } + ui32 nextIndex = 0; + ui32 startIndexExt = 0; + ui32 startIndexInt = 0; + for (ui32 idx = 0; idx < UI32ColIndex->length(); ++idx) { + if (nextIndex != UI32ColIndex->Value(idx)) { + if (idx - startIndexInt) { + AFL_VERIFY(RemapExternalToInternal.emplace(startIndexExt, TInternalChunkInfo(startIndexInt, idx - startIndexInt, false)).second); + } + AFL_VERIFY(RemapExternalToInternal.emplace(nextIndex, TInternalChunkInfo(0, UI32ColIndex->Value(idx) - nextIndex, true)).second); + startIndexExt = UI32ColIndex->Value(idx); + startIndexInt = idx; + } + nextIndex = UI32ColIndex->Value(idx) + 1; + } + if (UI32ColIndex->length() > startIndexInt) { + AFL_VERIFY(RemapExternalToInternal.emplace(startIndexExt, TInternalChunkInfo(startIndexInt, UI32ColIndex->length() - startIndexInt, false)).second); + } + if (nextIndex != RecordsCount) { + AFL_VERIFY(RemapExternalToInternal.emplace(nextIndex, TInternalChunkInfo(0, RecordsCount - nextIndex, true)).second); + } + ui32 count = 0; + for (auto&& i : RemapExternalToInternal) { + count += i.second.GetSize(); + } + AFL_VERIFY(count == RecordsCount)("count", count)("records_count", RecordsCount); + AFL_VERIFY(ColValue); +} + +ui64 TSparsedArrayChunk::GetRawSize() const { + return std::max(NArrow::GetBatchDataSize(Records), 8); +} + +std::shared_ptr TSparsedArrayChunk::GetScalar(const ui32 index) const { + AFL_VERIFY(index < RecordsCount); + for (ui32 idx = 0; idx < UI32ColIndex->length(); ++idx) { + if (UI32ColIndex->Value(idx) == index) { + return NArrow::TStatusValidator::GetValid(ColValue->GetScalar(idx)); + } + } + return DefaultValue; +} + +ui32 TSparsedArrayChunk::GetFirstIndexNotDefault() const { + if (UI32ColIndex->length()) { + return StartPosition + GetUI32ColIndex()->Value(0); + } else { + return StartPosition + GetRecordsCount(); + } +} + +std::shared_ptr TSparsedArrayChunk::GetMaxScalar() const { + if (!ColValue->length()) { + return DefaultValue; + } + auto minMax = NArrow::FindMinMaxPosition(ColValue); + auto currentScalar = NArrow::TStatusValidator::GetValid(ColValue->GetScalar(minMax.second)); + if (!DefaultValue || ScalarCompare(DefaultValue, currentScalar) < 0) { + return currentScalar; + } + return DefaultValue; +} + +void TSparsedArray::TBuilder::AddChunk(const ui32 recordsCount, const std::shared_ptr& data) { + AFL_VERIFY(data); + AFL_VERIFY(recordsCount); + AFL_VERIFY(data->num_rows() <= recordsCount)("rows", data->num_rows())("count", recordsCount); + AFL_VERIFY(data->num_columns() == 2)("count", data->num_columns()); + AFL_VERIFY(data->column(0)->type_id() == arrow::uint32()->id())("type", data->column(0)->type()->ToString()); + AFL_VERIFY_DEBUG(data->schema()->field(0)->name() == "index")("name", data->schema()->field(0)->name()); + if (data->num_rows()) { + auto* arr = static_cast(data->column(0).get()); + AFL_VERIFY(arr->Value(arr->length() - 1) < recordsCount)("val", arr->Value(arr->length() - 1))("count", recordsCount); + } + Chunks.emplace_back(TSparsedArrayChunk(RecordsCount, recordsCount, data, DefaultValue)); + RecordsCount += recordsCount; +} + +} // namespace NKikimr::NArrow::NAccessor diff --git a/ydb/core/formats/arrow/accessor/sparsed/accessor.h b/ydb/core/formats/arrow/accessor/sparsed/accessor.h new file mode 100644 index 000000000000..592f44a39de7 --- /dev/null +++ b/ydb/core/formats/arrow/accessor/sparsed/accessor.h @@ -0,0 +1,171 @@ +#pragma once +#include + +#include + +#include +#include +#include + +namespace NKikimr::NArrow::NAccessor { + +class TSparsedArrayChunk { +private: + YDB_READONLY(ui32, RecordsCount, 0); + YDB_READONLY(ui32, StartPosition, 0); + YDB_READONLY_DEF(std::shared_ptr, Records); + std::shared_ptr DefaultValue; + + std::shared_ptr ColIndex; + const ui32* RawValues = nullptr; + ui32 NotDefaultRecordsCount = 0; + YDB_READONLY_DEF(std::shared_ptr, UI32ColIndex); + YDB_READONLY_DEF(std::shared_ptr, ColValue); + + class TInternalChunkInfo { + private: + YDB_READONLY(ui32, Start, 0); + YDB_READONLY(ui32, Size, 0); + YDB_READONLY(bool, IsDefault, false); + + public: + TInternalChunkInfo(const ui32 start, const ui32 size, const bool defaultFlag) + : Start(start) + , Size(size) + , IsDefault(defaultFlag) { + AFL_VERIFY(Size); + } + }; + + std::map RemapExternalToInternal; + +public: + ui32 GetFinishPosition() const { + return StartPosition + RecordsCount; + } + + ui32 GetNotDefaultRecordsCount() const { + return NotDefaultRecordsCount; + } + + ui32 GetIndexUnsafeFast(const ui32 i) const { + return RawValues[i]; + } + + ui32 GetFirstIndexNotDefault() const; + + std::shared_ptr GetMaxScalar() const; + + std::shared_ptr GetScalar(const ui32 index) const; + + IChunkedArray::TCurrentChunkAddress GetChunk( + const std::optional& chunkCurrent, const ui64 position, const ui32 chunkIdx) const; + + std::vector> GetChunkedArray() const; + + TSparsedArrayChunk(const ui32 posStart, const ui32 recordsCount, const std::shared_ptr& records, + const std::shared_ptr& defaultValue); + + ui64 GetRawSize() const; +}; + +class TSparsedArray: public IChunkedArray { +private: + using TBase = IChunkedArray; + std::shared_ptr DefaultValue; + std::vector Records; + +protected: + virtual TCurrentArrayAddress DoGetArray(const std::optional& /*chunkCurrent*/, const ui64 /*position*/, + const std::shared_ptr& selfPtr) const override { + return TCurrentArrayAddress(selfPtr, 0, 0); + } + + virtual std::shared_ptr DoGetMaxScalar() const override; + + virtual std::vector DoSplitBySizes( + const TColumnSaver& saver, const TString& fullSerializedData, const std::vector& splitSizes) override; + + virtual TCurrentChunkAddress DoGetChunk(const std::optional& chunkCurrent, const ui64 position) const override { + ui32 currentIdx = 0; + for (ui32 i = 0; i < Records.size(); ++i) { + if (currentIdx <= position && position < currentIdx + Records[i].GetRecordsCount()) { + return Records[i].GetChunk(chunkCurrent, position - currentIdx, i); + } + currentIdx += Records[i].GetRecordsCount(); + } + AFL_VERIFY(false); + return TCurrentChunkAddress(nullptr, 0, 0); + } + virtual std::shared_ptr DoGetChunkedArray() const override { + std::vector> chunks; + for (auto&& i : Records) { + auto chunksLocal = i.GetChunkedArray(); + chunks.insert(chunks.end(), chunksLocal.begin(), chunksLocal.end()); + } + return std::make_shared(chunks, GetDataType()); + } + virtual std::optional DoGetRawSize() const override { + ui64 bytes = 0; + for (auto&& i : Records) { + bytes += i.GetRawSize(); + } + return bytes; + } + + TSparsedArray(std::vector&& data, const std::shared_ptr& defaultValue, + const std::shared_ptr& type, const ui32 recordsCount) + : TBase(recordsCount, EType::SparsedArray, type) + , Records(std::move(data)) { + } + + static ui32 GetLastIndex(const std::shared_ptr& batch); + +public: + TSparsedArray(const IChunkedArray& defaultArray, const std::shared_ptr& defaultValue); + TSparsedArray(const std::shared_ptr& defaultValue, + const std::shared_ptr& type, const ui32 recordsCount) + : TSparsedArray({}, defaultValue, type, recordsCount) + { + + } + + virtual std::shared_ptr DoGetScalar(const ui32 index) const override { + auto chunk = GetSparsedChunk(index); + return chunk.GetScalar(index - chunk.GetStartPosition()); + } + + TSparsedArrayChunk GetSparsedChunk(const ui64 position) const { + ui32 currentIdx = 0; + for (ui32 i = 0; i < Records.size(); ++i) { + if (currentIdx <= position && position < currentIdx + Records[i].GetRecordsCount()) { + return Records[i]; + } + currentIdx += Records[i].GetRecordsCount(); + } + AFL_VERIFY(false); + return Records.back(); + } + + class TBuilder { + private: + ui32 RecordsCount = 0; + std::vector Chunks; + std::shared_ptr DefaultValue; + std::shared_ptr Type; + + public: + TBuilder(const std::shared_ptr& defaultValue, const std::shared_ptr& type) + : DefaultValue(defaultValue) + , Type(type) { + } + + void AddChunk(const ui32 recordsCount, const std::shared_ptr& data); + + std::shared_ptr Finish() { + return std::shared_ptr(new TSparsedArray(std::move(Chunks), DefaultValue, Type, RecordsCount)); + } + }; +}; + +} // namespace NKikimr::NArrow::NAccessor diff --git a/ydb/core/formats/arrow/accessor/sparsed/constructor.cpp b/ydb/core/formats/arrow/accessor/sparsed/constructor.cpp new file mode 100644 index 000000000000..379f6473f884 --- /dev/null +++ b/ydb/core/formats/arrow/accessor/sparsed/constructor.cpp @@ -0,0 +1,34 @@ +#include "accessor.h" +#include "constructor.h" + +namespace NKikimr::NArrow::NAccessor::NSparsed { + +std::shared_ptr TConstructor::DoGetExpectedSchema(const std::shared_ptr& resultColumn) const { + arrow::FieldVector fields = { std::make_shared("index", arrow::uint32()), + std::make_shared("value", resultColumn->type()) }; + return std::make_shared(fields); +} + +TConclusion> TConstructor::DoConstructDefault(const TChunkConstructionData& externalInfo) const { + return std::make_shared(externalInfo.GetDefaultValue(), externalInfo.GetColumnType(), externalInfo.GetRecordsCount()); +} + +TConclusion> TConstructor::DoConstruct( + const std::shared_ptr& originalData, const TChunkConstructionData& externalInfo) const { + AFL_VERIFY(originalData->num_columns() == 2)("count", originalData->num_columns())("schema", originalData->schema()->ToString()); + NArrow::NAccessor::TSparsedArray::TBuilder builder(externalInfo.GetDefaultValue(), externalInfo.GetColumnType()); + builder.AddChunk(externalInfo.GetRecordsCount(), originalData); + return builder.Finish(); +} + +NKikimrArrowAccessorProto::TConstructor TConstructor::DoSerializeToProto() const { + NKikimrArrowAccessorProto::TConstructor result; + *result.MutableSparsed() = {}; + return result; +} + +bool TConstructor::DoDeserializeFromProto(const NKikimrArrowAccessorProto::TConstructor& proto) { + return true; +} + +} // namespace NKikimr::NArrow::NAccessor::NSparsed diff --git a/ydb/core/formats/arrow/accessor/sparsed/constructor.h b/ydb/core/formats/arrow/accessor/sparsed/constructor.h new file mode 100644 index 000000000000..05743cb4b373 --- /dev/null +++ b/ydb/core/formats/arrow/accessor/sparsed/constructor.h @@ -0,0 +1,28 @@ +#pragma once +#include +#include + +namespace NKikimr::NArrow::NAccessor::NSparsed { + +class TConstructor: public IConstructor { +public: + static TString GetClassNameStatic() { + return TGlobalConst::SparsedDataAccessorName; + } + +private: + static inline auto Registrator = TFactory::TRegistrator(GetClassNameStatic()); + virtual TConclusion> DoConstruct( + const std::shared_ptr& originalData, const TChunkConstructionData& externalInfo) const override; + virtual NKikimrArrowAccessorProto::TConstructor DoSerializeToProto() const override; + virtual bool DoDeserializeFromProto(const NKikimrArrowAccessorProto::TConstructor& proto) override; + virtual std::shared_ptr DoGetExpectedSchema(const std::shared_ptr& resultColumn) const override; + virtual TConclusion> DoConstructDefault(const TChunkConstructionData& externalInfo) const override; + +public: + virtual TString GetClassName() const override { + return GetClassNameStatic(); + } +}; + +} // namespace NKikimr::NArrow::NAccessor::NSparsed diff --git a/ydb/core/formats/arrow/accessor/sparsed/request.cpp b/ydb/core/formats/arrow/accessor/sparsed/request.cpp new file mode 100644 index 000000000000..d484341a95c0 --- /dev/null +++ b/ydb/core/formats/arrow/accessor/sparsed/request.cpp @@ -0,0 +1,22 @@ +#include "request.h" +#include "constructor.h" + +namespace NKikimr::NArrow::NAccessor::NSparsed { + +NKikimrArrowAccessorProto::TRequestedConstructor TRequestedConstuctor::DoSerializeToProto() const { + return NKikimrArrowAccessorProto::TRequestedConstructor(); +} + +bool TRequestedConstuctor::DoDeserializeFromProto(const NKikimrArrowAccessorProto::TRequestedConstructor& /*proto*/) { + return true; +} + +NKikimr::TConclusionStatus TRequestedConstuctor::DoDeserializeFromRequest(NYql::TFeaturesExtractor& /*features*/) { + return TConclusionStatus::Success(); +} + +NKikimr::TConclusion TRequestedConstuctor::DoBuildConstructor() const { + return std::make_shared(); +} + +} diff --git a/ydb/core/formats/arrow/accessor/sparsed/request.h b/ydb/core/formats/arrow/accessor/sparsed/request.h new file mode 100644 index 000000000000..205949bca97a --- /dev/null +++ b/ydb/core/formats/arrow/accessor/sparsed/request.h @@ -0,0 +1,26 @@ +#pragma once +#include +#include + +namespace NKikimr::NArrow::NAccessor::NSparsed { + +class TRequestedConstuctor: public IRequestedConstructor { +public: + static TString GetClassNameStatic() { + return TGlobalConst::SparsedDataAccessorName; + } + +private: + static inline auto Registrator = TFactory::TRegistrator(GetClassNameStatic()); + virtual TConclusion DoBuildConstructor() const override; + virtual NKikimrArrowAccessorProto::TRequestedConstructor DoSerializeToProto() const override; + virtual bool DoDeserializeFromProto(const NKikimrArrowAccessorProto::TRequestedConstructor& proto) override; + virtual TConclusionStatus DoDeserializeFromRequest(NYql::TFeaturesExtractor& features) override; + +public: + virtual TString GetClassName() const override { + return GetClassNameStatic(); + } +}; + +} // namespace NKikimr::NArrow::NAccessor::NSparsed diff --git a/ydb/core/formats/arrow/accessor/sparsed/ya.make b/ydb/core/formats/arrow/accessor/sparsed/ya.make new file mode 100644 index 000000000000..c4916a29c36c --- /dev/null +++ b/ydb/core/formats/arrow/accessor/sparsed/ya.make @@ -0,0 +1,14 @@ +LIBRARY() + +PEERDIR( + ydb/core/formats/arrow/protos + ydb/core/formats/arrow/accessor/abstract +) + +SRCS( + GLOBAL constructor.cpp + GLOBAL request.cpp + accessor.cpp +) + +END() diff --git a/ydb/core/formats/arrow/accessor/ya.make b/ydb/core/formats/arrow/accessor/ya.make new file mode 100644 index 000000000000..8d9536da5157 --- /dev/null +++ b/ydb/core/formats/arrow/accessor/ya.make @@ -0,0 +1,11 @@ +LIBRARY() + +PEERDIR( + ydb/core/formats/arrow/accessor/abstract + ydb/core/formats/arrow/accessor/plain + ydb/core/formats/arrow/accessor/composite + ydb/core/formats/arrow/accessor/composite_serial + ydb/core/formats/arrow/accessor/sparsed +) + +END() diff --git a/ydb/core/formats/arrow/common/adapter.h b/ydb/core/formats/arrow/common/adapter.h index 1b368e38de50..0a32dc4eb885 100644 --- a/ydb/core/formats/arrow/common/adapter.h +++ b/ydb/core/formats/arrow/common/adapter.h @@ -1,18 +1,20 @@ #pragma once #include "container.h" -#include "accessor.h" #include "validation.h" +#include +#include + #include -#include #include +#include #include +#include +#include #include #include -#include -#include -#include +#include namespace NKikimr::NArrow::NAdapter { @@ -27,7 +29,8 @@ class TDataBuilderPolicy { using TColumn = arrow::Array; using TAccessor = NAccessor::TTrivialArray; - [[nodiscard]] static std::shared_ptr AddColumn(const std::shared_ptr& batch, const std::shared_ptr& field, const std::shared_ptr& extCol) { + [[nodiscard]] static std::shared_ptr AddColumn(const std::shared_ptr& batch, + const std::shared_ptr& field, const std::shared_ptr& extCol) { return TStatusValidator::GetValid(batch->AddColumn(batch->num_columns(), field, extCol)); } @@ -37,7 +40,8 @@ class TDataBuilderPolicy { [[nodiscard]] static std::shared_ptr Build(const std::shared_ptr& schema, std::vector>&& columns, const ui32 count) { return arrow::RecordBatch::Make(schema, count, std::move(columns)); } - [[nodiscard]] static std::shared_ptr ApplyArrowFilter(const std::shared_ptr& batch, const std::shared_ptr& filter) { + [[nodiscard]] static std::shared_ptr ApplyArrowFilter( + const std::shared_ptr& batch, const std::shared_ptr& filter) { auto res = arrow::compute::Filter(batch, filter); Y_VERIFY_S(res.ok(), res.status().message()); Y_ABORT_UNLESS(res->kind() == arrow::Datum::RECORD_BATCH); @@ -46,7 +50,6 @@ class TDataBuilderPolicy { [[nodiscard]] static std::shared_ptr GetEmptySame(const std::shared_ptr& batch) { return batch->Slice(0, 0); } - }; template <> @@ -60,11 +63,13 @@ class TDataBuilderPolicy { [[nodiscard]] static std::shared_ptr Build(const std::shared_ptr& schema, std::vector>&& columns, const ui32 count) { return arrow::Table::Make(schema, std::move(columns), count); } - [[nodiscard]] static std::shared_ptr AddColumn(const std::shared_ptr& batch, const std::shared_ptr& field, const std::shared_ptr& extCol) { + [[nodiscard]] static std::shared_ptr AddColumn( + const std::shared_ptr& batch, const std::shared_ptr& field, const std::shared_ptr& extCol) { return TStatusValidator::GetValid(batch->AddColumn(batch->num_columns(), field, std::make_shared(extCol))); } - [[nodiscard]] static std::shared_ptr ApplyArrowFilter(const std::shared_ptr& batch, const std::shared_ptr& filter) { + [[nodiscard]] static std::shared_ptr ApplyArrowFilter( + const std::shared_ptr& batch, const std::shared_ptr& filter) { auto res = arrow::compute::Filter(batch, filter); Y_VERIFY_S(res.ok(), res.status().message()); Y_ABORT_UNLESS(res->kind() == arrow::Datum::TABLE); @@ -86,11 +91,13 @@ class TDataBuilderPolicy { } return std::make_shared(std::make_shared(std::move(fields)), std::move(columns)); } - [[nodiscard]] static std::shared_ptr AddColumn(const std::shared_ptr& batch, const std::shared_ptr& field, const std::shared_ptr& extCol) { + [[nodiscard]] static std::shared_ptr AddColumn(const std::shared_ptr& batch, + const std::shared_ptr& field, const std::shared_ptr& extCol) { batch->AddField(field, std::make_shared(extCol)).Validate(); return batch; } - [[nodiscard]] static std::shared_ptr ApplyArrowFilter(const std::shared_ptr& batch, const std::shared_ptr& filter) { + [[nodiscard]] static std::shared_ptr ApplyArrowFilter( + const std::shared_ptr& batch, const std::shared_ptr& filter) { auto table = batch->BuildTableVerified(); return std::make_shared(TDataBuilderPolicy::ApplyArrowFilter(table, filter)); } @@ -99,4 +106,4 @@ class TDataBuilderPolicy { } }; -} +} // namespace NKikimr::NArrow::NAdapter diff --git a/ydb/core/formats/arrow/common/container.cpp b/ydb/core/formats/arrow/common/container.cpp index ad0215737bc8..7e8209dfad51 100644 --- a/ydb/core/formats/arrow/common/container.cpp +++ b/ydb/core/formats/arrow/common/container.cpp @@ -1,8 +1,11 @@ #include "container.h" -#include + +#include #include #include +#include + namespace NKikimr::NArrow { TConclusionStatus TGeneralContainer::MergeColumnsStrictly(const TGeneralContainer& container) { @@ -13,8 +16,9 @@ TConclusionStatus TGeneralContainer::MergeColumnsStrictly(const TGeneralContaine RecordsCount = container.RecordsCount; } if (*RecordsCount != *container.RecordsCount) { - return TConclusionStatus::Fail(TStringBuilder() << "inconsistency records count in additional container: " << - container.GetSchema()->ToString() << ". expected: " << RecordsCount << ", reality: " << container.GetRecordsCount()); + return TConclusionStatus::Fail(TStringBuilder() + << "inconsistency records count in additional container: " << container.GetSchema()->ToString() + << ". expected: " << RecordsCount << ", reality: " << container.GetRecordsCount()); } for (i32 i = 0; i < container.Schema->num_fields(); ++i) { auto addFieldResult = AddField(container.Schema->field(i), container.Columns[i]); @@ -29,11 +33,12 @@ TConclusionStatus TGeneralContainer::AddField(const std::shared_ptrGetRecordsCount() != *RecordsCount) { - return TConclusionStatus::Fail(TStringBuilder() << "inconsistency records count in new column: " << - f->name() << ". expected: " << RecordsCount << ", reality: " << data->GetRecordsCount()); + return TConclusionStatus::Fail(TStringBuilder() << "inconsistency records count in new column: " << f->name() + << ". expected: " << RecordsCount << ", reality: " << data->GetRecordsCount()); } if (!data->GetDataType()->Equals(f->type())) { - return TConclusionStatus::Fail("schema and data type are not equals: " + data->GetDataType()->ToString() + " vs " + f->type()->ToString()); + return TConclusionStatus::Fail( + "schema and data type are not equals: " + data->GetDataType()->ToString() + " vs " + f->type()->ToString()); } { auto conclusion = Schema->AddField(f); @@ -64,7 +69,8 @@ void TGeneralContainer::Initialize() { recordsCount = Columns[i]->GetRecordsCount(); } else { AFL_VERIFY(*recordsCount == Columns[i]->GetRecordsCount()) - ("event", "inconsistency_records_count")("expect", *recordsCount)("real", Columns[i]->GetRecordsCount())("field_name", Schema->field(i)->name()); + ("event", "inconsistency_records_count")("expect", *recordsCount)("real", Columns[i]->GetRecordsCount())( + "field_name", Schema->field(i)->name()); } } AFL_VERIFY(recordsCount); @@ -72,24 +78,24 @@ void TGeneralContainer::Initialize() { RecordsCount = *recordsCount; } -TGeneralContainer::TGeneralContainer(const std::vector>& fields, std::vector>&& columns) +TGeneralContainer::TGeneralContainer( + const std::vector>& fields, std::vector>&& columns) : Schema(std::make_shared(fields)) - , Columns(std::move(columns)) -{ + , Columns(std::move(columns)) { Initialize(); } -TGeneralContainer::TGeneralContainer(const std::shared_ptr& schema, std::vector>&& columns) +TGeneralContainer::TGeneralContainer( + const std::shared_ptr& schema, std::vector>&& columns) : Schema(std::make_shared(schema)) - , Columns(std::move(columns)) -{ + , Columns(std::move(columns)) { Initialize(); } -TGeneralContainer::TGeneralContainer(const std::shared_ptr& schema, std::vector>&& columns) +TGeneralContainer::TGeneralContainer( + const std::shared_ptr& schema, std::vector>&& columns) : Schema(std::make_shared(schema)) - , Columns(std::move(columns)) -{ + , Columns(std::move(columns)) { Initialize(); } @@ -164,7 +170,8 @@ std::shared_ptr TGeneralContainer::GetAccessor return Columns[idx]; } -TConclusionStatus TGeneralContainer::SyncSchemaTo(const std::shared_ptr& schema, const IFieldsConstructor* defaultFieldsConstructor, const bool forceDefaults) { +TConclusionStatus TGeneralContainer::SyncSchemaTo( + const std::shared_ptr& schema, const IFieldsConstructor* defaultFieldsConstructor, const bool forceDefaults) { std::shared_ptr schemaNew = std::make_shared(); std::vector> columnsNew; if (!RecordsCount) { @@ -181,12 +188,14 @@ TConclusionStatus TGeneralContainer::SyncSchemaTo(const std::shared_ptr(NArrow::TThreadSimpleArraysCache::Get(i->type(), *defConclusion, *RecordsCount))); + columnsNew.emplace_back( + std::make_shared(NArrow::TThreadSimpleArraysCache::Get(i->type(), *defConclusion, *RecordsCount))); } } else { const auto& fOwned = Schema->GetFieldVerified(idx); if (!fOwned->type()->Equals(i->type())) { - return TConclusionStatus::Fail("different field types for '" + i->name() + "'. Have " + fOwned->type()->ToString() + ", need " + i->type()->ToString()); + return TConclusionStatus::Fail( + "different field types for '" + i->name() + "'. Have " + fOwned->type()->ToString() + ", need " + i->type()->ToString()); } schemaNew->AddField(fOwned).Validate(); columnsNew.emplace_back(Columns[idx]); @@ -206,7 +215,8 @@ TString TGeneralContainer::DebugString() const { return result; } -TConclusion> IFieldsConstructor::GetDefaultColumnElementValue(const std::shared_ptr& field, const bool force) const { +TConclusion> IFieldsConstructor::GetDefaultColumnElementValue( + const std::shared_ptr& field, const bool force) const { AFL_VERIFY(field); auto result = DoGetDefaultColumnElementValue(field->name()); if (result) { @@ -218,4 +228,4 @@ TConclusion> IFieldsConstructor::GetDefaultColumn return TConclusionStatus::Fail("have not default value for column " + field->name()); } -} +} // namespace NKikimr::NArrow diff --git a/ydb/core/formats/arrow/common/container.h b/ydb/core/formats/arrow/common/container.h index b92871b96c6a..90d686ce4e44 100644 --- a/ydb/core/formats/arrow/common/container.h +++ b/ydb/core/formats/arrow/common/container.h @@ -1,5 +1,5 @@ #pragma once -#include "accessor.h" +#include #include diff --git a/ydb/core/formats/arrow/common/validation.h b/ydb/core/formats/arrow/common/validation.h index f71f18ece59c..344128547d7c 100644 --- a/ydb/core/formats/arrow/common/validation.h +++ b/ydb/core/formats/arrow/common/validation.h @@ -1,26 +1,3 @@ #pragma once -#include -#include -#include - -namespace NKikimr::NArrow { - -class TStatusValidator { -public: - static void Validate(const arrow::Status& status); - - template - static T GetValid(const arrow::Result& result) { - Validate(result.status()); - return *result; - } - - template - static T GetValid(arrow::Result&& result) { - Validate(result.status()); - return std::move(*result); - } -}; - -} +#include diff --git a/ydb/core/formats/arrow/common/ya.make b/ydb/core/formats/arrow/common/ya.make index 61f742b09b76..76f8805b572f 100644 --- a/ydb/core/formats/arrow/common/ya.make +++ b/ydb/core/formats/arrow/common/ya.make @@ -5,13 +5,13 @@ PEERDIR( ydb/core/formats/arrow/switch ydb/library/actors/core ydb/library/conclusion + ydb/core/formats/arrow/splitter + ydb/core/formats/arrow/validation ) SRCS( container.cpp - validation.cpp adapter.cpp - accessor.cpp ) END() diff --git a/ydb/core/formats/arrow/protos/accessor.proto b/ydb/core/formats/arrow/protos/accessor.proto new file mode 100644 index 000000000000..015ea0b7cf89 --- /dev/null +++ b/ydb/core/formats/arrow/protos/accessor.proto @@ -0,0 +1,30 @@ +package NKikimrArrowAccessorProto; + +message TRequestedConstructor { + optional string ClassName = 1; + + message TPlain { + } + + message TSparsed { + } + + oneof Implementation { + TPlain Plain = 10; + TSparsed Sparsed = 11; + } +} + +message TConstructor { + optional string ClassName = 1; + + message TPlain { + } + + message TSparsed { + } + oneof Implementation { + TPlain Plain = 10; + TSparsed Sparsed = 11; + } +} \ No newline at end of file diff --git a/ydb/core/formats/arrow/protos/ya.make b/ydb/core/formats/arrow/protos/ya.make index 828b0aa0fb77..6fbf466a1771 100644 --- a/ydb/core/formats/arrow/protos/ya.make +++ b/ydb/core/formats/arrow/protos/ya.make @@ -3,6 +3,7 @@ PROTO_LIBRARY() SRCS( ssa.proto fields.proto + accessor.proto ) PEERDIR( diff --git a/ydb/core/formats/arrow/reader/position.cpp b/ydb/core/formats/arrow/reader/position.cpp index 6431d180d130..d26323a938ac 100644 --- a/ydb/core/formats/arrow/reader/position.cpp +++ b/ydb/core/formats/arrow/reader/position.cpp @@ -1,4 +1,7 @@ #include "position.h" + +#include + #include namespace NKikimr::NArrow::NMerger { @@ -15,7 +18,8 @@ NJson::TJsonValue TSortableBatchPosition::DebugJson() const { return result; } -std::optional TSortableBatchPosition::FindPosition(TRWSortableBatchPosition& position, const ui64 posStartExt, const ui64 posFinishExt, const TSortableBatchPosition& forFound, const bool greater) { +std::optional TSortableBatchPosition::FindPosition(TRWSortableBatchPosition& position, + const ui64 posStartExt, const ui64 posFinishExt, const TSortableBatchPosition& forFound, const bool greater) { ui64 posStart = posStartExt; ui64 posFinish = posFinishExt; { @@ -57,7 +61,8 @@ std::optional TSortableBatchPosition::Fi } } -std::optional TSortableBatchPosition::FindPosition(const std::shared_ptr& batch, const TSortableBatchPosition& forFound, const bool greater, const std::optional includedStartPosition) { +std::optional TSortableBatchPosition::FindPosition(const std::shared_ptr& batch, + const TSortableBatchPosition& forFound, const bool greater, const std::optional includedStartPosition) { if (!batch || !batch->num_rows()) { return {}; } @@ -75,10 +80,12 @@ std::optional TSortableBatchPosition::Fi } NKikimr::NArrow::NMerger::TRWSortableBatchPosition TSortableBatchPosition::BuildRWPosition() const { - return TRWSortableBatchPosition(Position, RecordsCount, ReverseSort, Sorting->BuildCopy(Position), Data ? Data->BuildCopy(Position) : nullptr); + return TRWSortableBatchPosition( + Position, RecordsCount, ReverseSort, Sorting->BuildCopy(Position), Data ? Data->BuildCopy(Position) : nullptr); } -NKikimr::NArrow::NMerger::TRWSortableBatchPosition TSortableBatchPosition::BuildRWPosition(std::shared_ptr batch, const ui32 position) const { +NKikimr::NArrow::NMerger::TRWSortableBatchPosition TSortableBatchPosition::BuildRWPosition( + std::shared_ptr batch, const ui32 position) const { std::vector dataColumns; if (Data) { dataColumns = Data->GetFieldNames(); @@ -98,7 +105,8 @@ TSortableBatchPosition::TFoundPosition TRWSortableBatchPosition::SkipToLower(con return *pos; } -TSortableScanData::TSortableScanData(const ui64 position, const std::shared_ptr& batch, const std::vector& columns) { +TSortableScanData::TSortableScanData( + const ui64 position, const std::shared_ptr& batch, const std::vector& columns) { for (auto&& i : columns) { auto c = batch->GetAccessorByNameOptional(i); AFL_VERIFY(c)("column_name", i)("columns", JoinSeq(",", columns))("batch", batch->DebugString()); @@ -110,7 +118,8 @@ TSortableScanData::TSortableScanData(const ui64 position, const std::shared_ptr< BuildPosition(position); } -TSortableScanData::TSortableScanData(const ui64 position, const std::shared_ptr& batch, const std::vector& columns) { +TSortableScanData::TSortableScanData( + const ui64 position, const std::shared_ptr& batch, const std::vector& columns) { for (auto&& i : columns) { auto c = batch->GetColumnByName(i); AFL_VERIFY(c)("column_name", i)("columns", JoinSeq(",", columns)); @@ -134,9 +143,12 @@ TSortableScanData::TSortableScanData(const ui64 position, const std::shared_ptr< BuildPosition(position); } -void TSortableScanData::AppendPositionTo(const std::vector>& builders, const ui64 position, ui64* recordSize) const { +void TSortableScanData::AppendPositionTo( + const std::vector>& builders, const ui64 position, ui64* recordSize) const { AFL_VERIFY(builders.size() == PositionAddress.size()); for (ui32 i = 0; i < PositionAddress.size(); ++i) { + AFL_VERIFY(PositionAddress[i].GetStartPosition() <= position)("pos", position)("start", PositionAddress[i].GetStartPosition()); + AFL_VERIFY(position < PositionAddress[i].GetFinishPosition())("pos", position)("finish", PositionAddress[i].GetFinishPosition()); AFL_VERIFY(NArrow::Append(*builders[i], *PositionAddress[i].GetArray(), position - PositionAddress[i].GetStartPosition(), recordSize)); } } @@ -178,6 +190,8 @@ bool TSortableScanData::InitPosition(const ui64 position) { } StartPosition = std::max(StartPosition, i.GetStartPosition()); FinishPosition = std::min(FinishPosition, i.GetFinishPosition()); + AFL_VERIFY(position < i.GetFinishPosition()); + AFL_VERIFY(i.GetStartPosition() <= position); ++idx; } AFL_VERIFY(StartPosition < FinishPosition); @@ -217,9 +231,8 @@ void TCursor::AppendPositionTo(const std::vector& table, const ui64 position, const std::vector& columns) - : Position(position) -{ + : Position(position) { PositionAddress = TSortableScanData(position, table, columns).GetPositionAddress(); } -} +} // namespace NKikimr::NArrow::NMerger diff --git a/ydb/core/formats/arrow/reader/position.h b/ydb/core/formats/arrow/reader/position.h index 5b8fd35d5bc7..440f12fac5de 100644 --- a/ydb/core/formats/arrow/reader/position.h +++ b/ydb/core/formats/arrow/reader/position.h @@ -1,5 +1,5 @@ #pragma once -#include +#include #include #include #include diff --git a/ydb/core/formats/arrow/save_load/loader.cpp b/ydb/core/formats/arrow/save_load/loader.cpp new file mode 100644 index 000000000000..d6f200b35c05 --- /dev/null +++ b/ydb/core/formats/arrow/save_load/loader.cpp @@ -0,0 +1,68 @@ +#include "loader.h" + +#include + +namespace NKikimr::NArrow::NAccessor { + +TString TColumnLoader::DebugString() const { + TStringBuilder result; + result << "accessor_constructor:" << AccessorConstructor->DebugString() << ";"; + result << "result_field:" << ResultField->ToString() << ";"; + if (Transformer) { + result << "transformer:" << Transformer->DebugString() << ";"; + } + result << "serializer:" << Serializer->DebugString() << ";"; + return result; +} + +TColumnLoader::TColumnLoader(NTransformation::ITransformer::TPtr transformer, const NSerialization::TSerializerContainer& serializer, + const TConstructorContainer& accessorConstructor, const std::shared_ptr& resultField, + const std::shared_ptr& defaultValue, const ui32 columnId) + : Serializer(serializer) + , Transformer(transformer) + , AccessorConstructor(accessorConstructor) + , ResultField(resultField) + , DefaultValue(defaultValue) + , ColumnId(columnId) { + AFL_VERIFY(!!AccessorConstructor); + AFL_VERIFY(ResultField); + AFL_VERIFY(Serializer); +} + +const std::shared_ptr& TColumnLoader::GetField() const { + return ResultField; +} + +arrow::Result> TColumnLoader::Apply(const TString& data) const { + Y_ABORT_UNLESS(Serializer); + arrow::Result> columnArray = + Transformer ? Serializer->Deserialize(data) : Serializer->Deserialize(data, AccessorConstructor->GetExpectedSchema(ResultField)); + if (!columnArray.ok()) { + return columnArray; + } + if (Transformer) { + return Transformer->Transform(*columnArray); + } else { + return columnArray; + } +} + +std::shared_ptr TColumnLoader::ApplyRawVerified(const TString& data) const { + return TStatusValidator::GetValid(Apply(data)); +} + +std::shared_ptr TColumnLoader::ApplyVerified(const TString& dataStr, const ui32 recordsCount) const { + auto data = TStatusValidator::GetValid(Apply(dataStr)); + return BuildAccessor(data, TChunkConstructionData(recordsCount, DefaultValue, ResultField->type())); +} + +std::shared_ptr TColumnLoader::BuildAccessor( + const std::shared_ptr& batch, const TChunkConstructionData& chunkData) const { + return AccessorConstructor->Construct(batch, chunkData).DetachResult(); +} + +std::shared_ptr TColumnLoader::BuildDefaultAccessor(const ui32 recordsCount) const { + return AccessorConstructor->ConstructDefault(TChunkConstructionData(recordsCount, DefaultValue, ResultField->type())).DetachResult(); +} + +} // namespace NKikimr::NArrow::NAccessor diff --git a/ydb/core/formats/arrow/save_load/loader.h b/ydb/core/formats/arrow/save_load/loader.h new file mode 100644 index 000000000000..1b42e41fc106 --- /dev/null +++ b/ydb/core/formats/arrow/save_load/loader.h @@ -0,0 +1,56 @@ +#pragma once +#include +#include +#include + +#include + +#include + +namespace NKikimr::NArrow::NAccessor { + +class TColumnLoader { +private: + NSerialization::TSerializerContainer Serializer; + NTransformation::ITransformer::TPtr Transformer; + YDB_READONLY_DEF(NAccessor::TConstructorContainer, AccessorConstructor); + YDB_READONLY_DEF(std::shared_ptr, ResultField); + YDB_READONLY_DEF(std::shared_ptr, DefaultValue); + const ui32 ColumnId; + + arrow::Result> Apply(const TString& data) const; + std::shared_ptr BuildAccessor( + const std::shared_ptr& batch, const TChunkConstructionData& chunkData) const; + +public: + std::shared_ptr BuildDefaultAccessor(const ui32 recordsCount) const; + + bool IsEqualTo(const TColumnLoader& item) const { + if (!!Transformer != !!item.Transformer) { + return false; + } else if (!!Transformer && !Transformer->IsEqualTo(*item.Transformer)) { + return false; + } + if (!Serializer.IsEqualTo(item.Serializer)) { + return false; + } + return true; + } + + TString DebugString() const; + + TColumnLoader(NTransformation::ITransformer::TPtr transformer, const NSerialization::TSerializerContainer& serializer, + const NAccessor::TConstructorContainer& accessorConstructor, const std::shared_ptr& resultField, + const std::shared_ptr& defaultValue, const ui32 columnId); + + ui32 GetColumnId() const { + return ColumnId; + } + + const std::shared_ptr& GetField() const; + + std::shared_ptr ApplyVerified(const TString& data, const ui32 expectedRecordsCount) const; + std::shared_ptr ApplyRawVerified(const TString& data) const; +}; + +} // namespace NKikimr::NArrow::NAccessor diff --git a/ydb/core/tx/columnshard/engines/scheme/abstract/saver.cpp b/ydb/core/formats/arrow/save_load/saver.cpp similarity index 63% rename from ydb/core/tx/columnshard/engines/scheme/abstract/saver.cpp rename to ydb/core/formats/arrow/save_load/saver.cpp index c15db92b8eec..95adebc76471 100644 --- a/ydb/core/tx/columnshard/engines/scheme/abstract/saver.cpp +++ b/ydb/core/formats/arrow/save_load/saver.cpp @@ -1,6 +1,6 @@ #include "saver.h" -namespace NKikimr::NOlap { +namespace NKikimr::NArrow::NAccessor { TColumnSaver::TColumnSaver(NArrow::NTransformation::ITransformer::TPtr transformer, const NArrow::NSerialization::TSerializerContainer serializer) : Transformer(transformer) @@ -21,10 +21,17 @@ TString TColumnSaver::Apply(std::shared_ptr data, std::shared_ptr< TString TColumnSaver::Apply(const std::shared_ptr& data) const { Y_ABORT_UNLESS(Serializer); + NArrow::NSerialization::TSerializerContainer serializer = Serializer; + if (SerializerBySizeUpperBorder.size()) { + auto it = SerializerBySizeUpperBorder.lower_bound(data->num_rows()); + if (it != SerializerBySizeUpperBorder.end()) { + serializer = it->second; + } + } if (Transformer) { - return Serializer->SerializeFull(Transformer->Transform(data)); + return serializer->SerializeFull(Transformer->Transform(data)); } else { - return Serializer->SerializePayload(data); + return serializer->SerializePayload(data); } } diff --git a/ydb/core/tx/columnshard/engines/scheme/abstract/saver.h b/ydb/core/formats/arrow/save_load/saver.h similarity index 59% rename from ydb/core/tx/columnshard/engines/scheme/abstract/saver.h rename to ydb/core/formats/arrow/save_load/saver.h index c4d10c55a359..3532a0195fa3 100644 --- a/ydb/core/tx/columnshard/engines/scheme/abstract/saver.h +++ b/ydb/core/formats/arrow/save_load/saver.h @@ -1,25 +1,30 @@ #pragma once -#include #include +#include + +#include + #include #include -namespace NKikimr::NOlap { +namespace NKikimr::NArrow::NAccessor { class TColumnSaver { private: NArrow::NTransformation::ITransformer::TPtr Transformer; - NArrow::NSerialization::TSerializerContainer Serializer; + YDB_READONLY_DEF(NArrow::NSerialization::TSerializerContainer, Serializer); + std::map SerializerBySizeUpperBorder; + public: TColumnSaver() = default; TColumnSaver(NArrow::NTransformation::ITransformer::TPtr transformer, const NArrow::NSerialization::TSerializerContainer serializer); - void ResetSerializer(const NArrow::NSerialization::TSerializerContainer& serializer) { - AFL_VERIFY(serializer); + void AddSerializerWithBorder(const ui32 upperBorder, const NArrow::NSerialization::TSerializerContainer& serializer) { if (Serializer.IsCompatibleForExchange(serializer)) { - Serializer = serializer; + AFL_VERIFY(SerializerBySizeUpperBorder.emplace(upperBorder, serializer).second); } else { - AFL_WARN(NKikimrServices::TX_COLUMNSHARD)("event", "cannot_reset_serializer")("reason", "incompatible_serializers"); + AFL_WARN(NKikimrServices::TX_COLUMNSHARD)("event", "cannot_add_serializer")("reason", "incompatible_serializers")( + "border", upperBorder); } } @@ -30,5 +35,4 @@ class TColumnSaver { TString Apply(const std::shared_ptr& data) const; }; - -} \ No newline at end of file +} // namespace NKikimr::NArrow::NAccessor diff --git a/ydb/core/formats/arrow/save_load/ya.make b/ydb/core/formats/arrow/save_load/ya.make new file mode 100644 index 000000000000..db2d6667519a --- /dev/null +++ b/ydb/core/formats/arrow/save_load/ya.make @@ -0,0 +1,17 @@ +LIBRARY() + +SRCS( + saver.cpp + loader.cpp +) + +PEERDIR( + ydb/library/actors/core + contrib/libs/apache/arrow + ydb/library/accessor + ydb/library/conclusion + ydb/core/formats/arrow/transformer + ydb/core/formats/arrow/serializer +) + +END() diff --git a/ydb/core/formats/arrow/serializer/ya.make b/ydb/core/formats/arrow/serializer/ya.make index bf7e091ab4bf..79a3ae1a3ddf 100644 --- a/ydb/core/formats/arrow/serializer/ya.make +++ b/ydb/core/formats/arrow/serializer/ya.make @@ -2,7 +2,7 @@ LIBRARY() PEERDIR( contrib/libs/apache/arrow - ydb/core/formats/arrow/common + ydb/core/formats/arrow/validation ydb/services/metadata/abstract ydb/library/actors/core ydb/core/protos diff --git a/ydb/core/formats/arrow/simple_arrays_cache.h b/ydb/core/formats/arrow/simple_arrays_cache.h index e527e44a0b08..2d307171658f 100644 --- a/ydb/core/formats/arrow/simple_arrays_cache.h +++ b/ydb/core/formats/arrow/simple_arrays_cache.h @@ -36,10 +36,14 @@ class TThreadSimpleArraysCache { } std::shared_ptr GetNullImpl(const std::shared_ptr& type, const ui32 recordsCount); - std::shared_ptr GetConstImpl(const std::shared_ptr& type, const std::shared_ptr& scalar, const ui32 recordsCount); + std::shared_ptr GetConstImpl( + const std::shared_ptr& type, const std::shared_ptr& scalar, const ui32 recordsCount); + public: static std::shared_ptr GetNull(const std::shared_ptr& type, const ui32 recordsCount); - static std::shared_ptr GetConst(const std::shared_ptr& type, const std::shared_ptr& scalar, const ui32 recordsCount); - static std::shared_ptr Get(const std::shared_ptr& type, const std::shared_ptr& scalar, const ui32 recordsCount); + static std::shared_ptr GetConst( + const std::shared_ptr& type, const std::shared_ptr& scalar, const ui32 recordsCount); + static std::shared_ptr Get( + const std::shared_ptr& type, const std::shared_ptr& scalar, const ui32 recordsCount); }; -} +} // namespace NKikimr::NArrow diff --git a/ydb/core/formats/arrow/simple_builder/filler.cpp b/ydb/core/formats/arrow/simple_builder/filler.cpp index f6168701ddbe..337941bebe42 100644 --- a/ydb/core/formats/arrow/simple_builder/filler.cpp +++ b/ydb/core/formats/arrow/simple_builder/filler.cpp @@ -1,11 +1,17 @@ #include "filler.h" + #include +#include namespace NKikimr::NArrow::NConstruction { -TStringPoolFiller::TStringPoolFiller(const ui32 poolSize, const ui32 strLen) { +TStringPoolFiller::TStringPoolFiller(const ui32 poolSize, const ui32 strLen, const TString& defaultValue, const double defaultValueFrq) { for (ui32 i = 0; i < poolSize; ++i) { - Data.emplace_back(NUnitTest::RandomString(strLen, i)); + if (RandomNumber() < defaultValueFrq) { + Data.emplace_back(defaultValue); + } else { + Data.emplace_back(NUnitTest::RandomString(strLen, i)); + } } } @@ -14,4 +20,4 @@ arrow::util::string_view TStringPoolFiller::GetValue(const ui32 idx) const { return arrow::util::string_view(str.data(), str.size()); } -} +} // namespace NKikimr::NArrow::NConstruction diff --git a/ydb/core/formats/arrow/simple_builder/filler.h b/ydb/core/formats/arrow/simple_builder/filler.h index e86e7a6c2139..2a773d0db4f9 100644 --- a/ydb/core/formats/arrow/simple_builder/filler.h +++ b/ydb/core/formats/arrow/simple_builder/filler.h @@ -1,9 +1,9 @@ #pragma once #include -#include #include -#include +#include #include +#include namespace NKikimr::NArrow::NConstruction { @@ -11,16 +11,17 @@ template class TIntSeqFiller { public: using TValue = TArrowInt; + private: using CType = typename TArrowInt::c_type; const CType Delta; + public: CType GetValue(const CType idx) const { return Delta + idx; } TIntSeqFiller(const CType delta = 0) : Delta(delta) { - } }; @@ -28,27 +29,29 @@ template class TIntConstFiller { public: using TValue = TArrowInt; + private: using CType = typename TArrowInt::c_type; const CType Value; + public: CType GetValue(const CType /*idx*/) const { return Value; } TIntConstFiller(const CType value) : Value(value) { - } }; class TStringPoolFiller { private: std::vector Data; + public: using TValue = arrow::StringType; arrow::util::string_view GetValue(const ui32 idx) const; - TStringPoolFiller(const ui32 poolSize, const ui32 strLen); + TStringPoolFiller(const ui32 poolSize, const ui32 strLen, const TString& defaultValue = "", const double defaultValueFrq = 0); }; template @@ -56,6 +59,7 @@ class TLinearArrayAccessor { private: using TArray = typename arrow::TypeTraits::ArrayType; const TArray& Data; + public: using TValue = TValueExt; auto GetValue(const ui32 idx) const { @@ -72,6 +76,7 @@ class TBinaryArrayAccessor { private: using TArray = typename arrow::TypeTraits::ArrayType; const TArray& Data; + public: using TValue = TValueExt; const char* GetValueView(const ui32 idx) const { @@ -89,6 +94,7 @@ class TDictionaryArrayAccessor { using TDictionary = typename arrow::TypeTraits::ArrayType; const TDictionary& Dictionary; const TIndices& Indices; + public: using TValue = TDictionaryValue; auto GetValue(const ui32 idx) const { @@ -108,6 +114,7 @@ class TBinaryDictionaryArrayAccessor { const TDictionary& Dictionary; const TIndices& Indices; std::vector DictionaryStrings; + public: using TValue = TDictionaryValue; const char* GetValueView(const ui32 idx) const { @@ -116,8 +123,7 @@ class TBinaryDictionaryArrayAccessor { TBinaryDictionaryArrayAccessor(const TDictionary& dictionary, const TIndices& indices) : Dictionary(dictionary) - , Indices(indices) - { + , Indices(indices) { DictionaryStrings.reserve(Dictionary.length()); for (i64 idx = 0; idx < Dictionary.length(); ++idx) { auto sView = Dictionary.Value(idx); @@ -126,4 +132,4 @@ class TBinaryDictionaryArrayAccessor { } }; -} +} // namespace NKikimr::NArrow::NConstruction diff --git a/ydb/core/formats/arrow/splitter/scheme_info.cpp b/ydb/core/formats/arrow/splitter/scheme_info.cpp new file mode 100644 index 000000000000..35a8fcc5c035 --- /dev/null +++ b/ydb/core/formats/arrow/splitter/scheme_info.cpp @@ -0,0 +1,13 @@ +#include "scheme_info.h" + +namespace NKikimr::NArrow::NSplitter { + +NAccessor::TColumnSaver ISchemaDetailInfo::GetColumnSaver(const ui32 columnId) const { + auto saver = DoGetColumnSaver(columnId); + if (OverrideSerializer) { + saver.AddSerializerWithBorder(Max(), *OverrideSerializer); + } + return saver; +} + +} diff --git a/ydb/core/tx/columnshard/splitter/scheme_info.h b/ydb/core/formats/arrow/splitter/scheme_info.h similarity index 64% rename from ydb/core/tx/columnshard/splitter/scheme_info.h rename to ydb/core/formats/arrow/splitter/scheme_info.h index 1e72e63e9d35..400bdfcc7862 100644 --- a/ydb/core/tx/columnshard/splitter/scheme_info.h +++ b/ydb/core/formats/arrow/splitter/scheme_info.h @@ -1,26 +1,29 @@ #pragma once #include "stats.h" -#include +#include +#include #include -namespace NKikimr::NOlap { +namespace NKikimr::NArrow::NSplitter { class ISchemaDetailInfo { private: - YDB_ACCESSOR_DEF(std::optional, OverrideSerializer); + YDB_ACCESSOR_DEF(std::optional, OverrideSerializer); + protected: - virtual TColumnSaver DoGetColumnSaver(const ui32 columnId) const = 0; + virtual NAccessor::TColumnSaver DoGetColumnSaver(const ui32 columnId) const = 0; + public: using TPtr = std::shared_ptr; virtual ~ISchemaDetailInfo() = default; virtual ui32 GetColumnId(const std::string& fieldName) const = 0; - TColumnSaver GetColumnSaver(const ui32 columnId) const; + NAccessor::TColumnSaver GetColumnSaver(const ui32 columnId) const; virtual std::shared_ptr GetField(const ui32 columnId) const = 0; virtual std::optional GetColumnSerializationStats(const ui32 columnId) const = 0; virtual bool NeedMinMaxForColumn(const ui32 columnId) const = 0; virtual bool IsSortedColumn(const ui32 columnId) const = 0; virtual std::optional GetBatchSerializationStats(const std::shared_ptr& rb) const = 0; }; -} +} // namespace NKikimr::NArrow::NSplitter diff --git a/ydb/core/formats/arrow/splitter/similar_packer.cpp b/ydb/core/formats/arrow/splitter/similar_packer.cpp new file mode 100644 index 000000000000..94395e18c3ee --- /dev/null +++ b/ydb/core/formats/arrow/splitter/similar_packer.cpp @@ -0,0 +1,5 @@ +#include "similar_packer.h" + +namespace NKikimr::NArrow::NSplitter { + +} diff --git a/ydb/core/tx/columnshard/splitter/similar_packer.h b/ydb/core/formats/arrow/splitter/similar_packer.h similarity index 98% rename from ydb/core/tx/columnshard/splitter/similar_packer.h rename to ydb/core/formats/arrow/splitter/similar_packer.h index 54abde2640fb..1fdfdf3e7a83 100644 --- a/ydb/core/tx/columnshard/splitter/similar_packer.h +++ b/ydb/core/formats/arrow/splitter/similar_packer.h @@ -3,7 +3,7 @@ #include -namespace NKikimr::NOlap { +namespace NKikimr::NArrow::NSplitter { template class TArrayView { diff --git a/ydb/core/tx/columnshard/splitter/simple.cpp b/ydb/core/formats/arrow/splitter/simple.cpp similarity index 78% rename from ydb/core/tx/columnshard/splitter/simple.cpp rename to ydb/core/formats/arrow/splitter/simple.cpp index 0af14f0ff9b4..8af1ca704212 100644 --- a/ydb/core/tx/columnshard/splitter/simple.cpp +++ b/ydb/core/formats/arrow/splitter/simple.cpp @@ -1,27 +1,11 @@ #include "simple.h" -#include + #include #include -#include - -namespace NKikimr::NOlap { - -std::vector> TSplittedColumnChunk::DoInternalSplitImpl(const TColumnSaver& saver, const std::shared_ptr& counters, const std::vector& splitSizes) const { - auto chunks = TSimpleSplitter(saver, counters).SplitBySizes(Data.GetSlicedBatch(), Data.GetSerializedChunk(), splitSizes); - std::vector> newChunks; - for (auto&& i : chunks) { - newChunks.emplace_back(std::make_shared(GetColumnId(), i, SchemaInfo)); - } - return newChunks; -} -TString TSplittedColumnChunk::DoDebugString() const { - return TStringBuilder() << "records_count=" << GetRecordsCount() << ";data=" << NArrow::DebugJson(Data.GetSlicedBatch(), 3, 3) << ";"; -} +#include -ui64 TSplittedColumnChunk::DoGetRawBytesImpl() const { - return NArrow::GetBatchDataSize(Data.GetSlicedBatch()); -} +namespace NKikimr::NArrow::NSplitter { std::vector TSimpleSplitter::Split(const std::shared_ptr& data, const std::shared_ptr& field, const ui32 maxBlobSize) const { AFL_VERIFY(data); @@ -38,27 +22,27 @@ class TSplitChunk { ui32 SplitFactor = 0; ui32 Iterations = 0; ui32 MaxBlobSize = 8 * 1024 * 1024; - TColumnSaver ColumnSaver; - std::shared_ptr Counters; + NAccessor::TColumnSaver ColumnSaver; + public: - TSplitChunk(const ui32 baseSplitFactor, const ui32 maxBlobSize, const std::shared_ptr& data, const TColumnSaver& columnSaver, const std::shared_ptr& counters) + TSplitChunk(const ui32 baseSplitFactor, const ui32 maxBlobSize, const std::shared_ptr& data, + const NAccessor::TColumnSaver& columnSaver) : Data(data) , SplitFactor(baseSplitFactor) , MaxBlobSize(maxBlobSize) , ColumnSaver(columnSaver) - , Counters(counters) { AFL_VERIFY(Data && Data->num_rows()); AFL_VERIFY(SplitFactor); } - TSplitChunk(const ui32 baseSplitFactor, const ui32 maxBlobSize, const std::shared_ptr& data, TString&& serializedData, const TColumnSaver& columnSaver, const std::shared_ptr& counters) + TSplitChunk(const ui32 baseSplitFactor, const ui32 maxBlobSize, const std::shared_ptr& data, TString&& serializedData, + const NAccessor::TColumnSaver& columnSaver) : Data(data) , Result(TSaverSplittedChunk(data, std::move(serializedData))) , SplitFactor(baseSplitFactor) , MaxBlobSize(maxBlobSize) , ColumnSaver(columnSaver) - , Counters(counters) { AFL_VERIFY(Data && Data->num_rows()); AFL_VERIFY(SplitFactor); @@ -68,18 +52,17 @@ class TSplitChunk { while (true) { AFL_VERIFY(!Result); AFL_VERIFY(++Iterations < 100); - AFL_VERIFY(SplitFactor <= Data->num_rows())("factor", SplitFactor)("records", Data->num_rows())("iteration", Iterations)("size", NArrow::GetBatchDataSize(Data)); + AFL_VERIFY(SplitFactor <= Data->num_rows())("factor", SplitFactor)("records", Data->num_rows())("iteration", Iterations)( + "size", NArrow::GetBatchDataSize(Data)); bool found = false; std::vector result; if (SplitFactor == 1) { TString blob = ColumnSaver.Apply(Data); if (blob.size() < MaxBlobSize) { - Counters->SimpleSplitter.OnCorrectSerialized(blob.size()); Result = TSaverSplittedChunk(Data, std::move(blob)); found = true; result.emplace_back(*this); } else { - Counters->SimpleSplitter.OnTrashSerialized(blob.size()); TBatchSerializationStat stats(blob.size(), Data->num_rows(), NArrow::GetBatchDataSize(Data)); SplitFactor = stats.PredictOptimalSplitFactor(Data->num_rows(), MaxBlobSize).value_or(1); if (SplitFactor == 1) { @@ -98,7 +81,6 @@ class TSplitChunk { auto slice = Data->Slice(it.GetPosition(), it.GetCurrentPackSize()); TString blob = ColumnSaver.Apply(slice); if (blob.size() >= MaxBlobSize) { - Counters->SimpleSplitter.OnTrashSerialized(blob.size()); if (!badStartPosition) { badStartPosition = it.GetPosition(); } @@ -107,25 +89,24 @@ class TSplitChunk { ++badBatchCount; Y_ABORT_UNLESS(!linearSplitting.IsMinimalGranularity()); } else { - Counters->SimpleSplitter.OnCorrectSerialized(blob.size()); if (badStartPosition) { AFL_VERIFY(badBatchRecordsCount && badBatchCount)("count", badBatchCount)("records", badBatchRecordsCount); auto badSlice = Data->Slice(*badStartPosition, badBatchRecordsCount); TBatchSerializationStat stats(badBatchSerializedSize, badBatchRecordsCount, Max()); - result.emplace_back(std::max(stats.PredictOptimalSplitFactor(badBatchRecordsCount, MaxBlobSize).value_or(1), badBatchCount) + 1, MaxBlobSize, badSlice, ColumnSaver, Counters); + result.emplace_back(std::max(stats.PredictOptimalSplitFactor(badBatchRecordsCount, MaxBlobSize).value_or(1), badBatchCount) + 1, MaxBlobSize, badSlice, ColumnSaver); badStartPosition = {}; badBatchRecordsCount = 0; badBatchCount = 0; badBatchSerializedSize = 0; } found = true; - result.emplace_back(1, MaxBlobSize, slice, std::move(blob), ColumnSaver, Counters); + result.emplace_back(1, MaxBlobSize, slice, std::move(blob), ColumnSaver); } } if (badStartPosition) { auto badSlice = Data->Slice(*badStartPosition, badBatchRecordsCount); TBatchSerializationStat stats(badBatchSerializedSize, badBatchRecordsCount, Max()); - result.emplace_back(std::max(stats.PredictOptimalSplitFactor(badBatchRecordsCount, MaxBlobSize).value_or(1), badBatchCount) + 1, MaxBlobSize, badSlice, ColumnSaver, Counters); + result.emplace_back(std::max(stats.PredictOptimalSplitFactor(badBatchRecordsCount, MaxBlobSize).value_or(1), badBatchCount) + 1, MaxBlobSize, badSlice, ColumnSaver); } ++SplitFactor; } @@ -139,9 +120,8 @@ class TSplitChunk { }; std::vector TSimpleSplitter::Split(const std::shared_ptr& data, const ui32 maxBlobSize) const { - AFL_VERIFY(data->num_columns() == 1); AFL_VERIFY(data->num_rows()); - TSplitChunk baseChunk(Stats ? Stats->PredictOptimalSplitFactor(data->num_rows(), maxBlobSize).value_or(1) : 1, maxBlobSize, data, ColumnSaver, Counters); + TSplitChunk baseChunk(Stats ? Stats->PredictOptimalSplitFactor(data->num_rows(), maxBlobSize).value_or(1) : 1, maxBlobSize, data, ColumnSaver); std::vector chunks = {baseChunk}; for (auto it = chunks.begin(); it != chunks.end(); ) { AFL_VERIFY(chunks.size() < 100); @@ -217,12 +197,4 @@ std::vector TSimpleSplitter::SplitBySizes(std::shared_ptr TSaverSplittedChunk::GetFirstScalar() const { - return NArrow::TStatusValidator::GetValid(SlicedBatch->column(0)->GetScalar(0)); -} - -std::shared_ptr TSaverSplittedChunk::GetLastScalar() const { - return NArrow::TStatusValidator::GetValid(SlicedBatch->column(0)->GetScalar(GetRecordsCount() - 1)); -} - } diff --git a/ydb/core/tx/columnshard/splitter/simple.h b/ydb/core/formats/arrow/splitter/simple.h similarity index 57% rename from ydb/core/tx/columnshard/splitter/simple.h rename to ydb/core/formats/arrow/splitter/simple.h index 01467190373d..5be5c0b9b5d2 100644 --- a/ydb/core/tx/columnshard/splitter/simple.h +++ b/ydb/core/formats/arrow/splitter/simple.h @@ -1,51 +1,25 @@ #pragma once #include -#include -#include #include "stats.h" -#include "chunks.h" #include "scheme_info.h" -namespace NKikimr::NOlap { +namespace NKikimr::NArrow::NSplitter { class TSaverSplittedChunk { private: YDB_READONLY_DEF(std::shared_ptr, SlicedBatch); YDB_READONLY_DEF(TString, SerializedChunk); public: - std::shared_ptr GetColumn() const { - return SlicedBatch->column(0); - } - ui32 GetRecordsCount() const { return SlicedBatch->num_rows(); } - std::shared_ptr GetFirstScalar() const; - std::shared_ptr GetLastScalar() const; - TSaverSplittedChunk(const std::shared_ptr& batch, TString&& serializedChunk) : SlicedBatch(batch) , SerializedChunk(std::move(serializedChunk)) { Y_ABORT_UNLESS(SlicedBatch); - Y_ABORT_UNLESS(SlicedBatch->num_columns() == 1); Y_ABORT_UNLESS(SlicedBatch->num_rows()); } - - bool IsCompatibleColumn(const std::shared_ptr& f) const { - if (!SlicedBatch) { - return false; - } - if (SlicedBatch->num_columns() != 1) { - AFL_ERROR(NKikimrServices::TX_COLUMNSHARD)("event", "unexpected columns count")("expectation", 1)("actual", SlicedBatch->num_columns()); - return false; - } - if (!SlicedBatch->schema()->fields().front()->Equals(f)) { - AFL_ERROR(NKikimrServices::TX_COLUMNSHARD)("event", "unexpected column type")("expectation", f->ToString())("actual", SlicedBatch->schema()->fields().front()->ToString()); - return false; - } - return true; - } }; class TLinearSplitInfo { @@ -109,13 +83,11 @@ class TLinearSplitInfo { class TSimpleSplitter { private: - TColumnSaver ColumnSaver; + NAccessor::TColumnSaver ColumnSaver; YDB_ACCESSOR_DEF(std::optional, Stats); - std::shared_ptr Counters; public: - explicit TSimpleSplitter(const TColumnSaver& columnSaver, std::shared_ptr counters) + explicit TSimpleSplitter(const NAccessor::TColumnSaver& columnSaver) : ColumnSaver(columnSaver) - , Counters(counters) { } @@ -146,56 +118,4 @@ class TSimpleSplitter { std::vector SplitBySizes(std::shared_ptr data, const TString& dataSerialization, const std::vector& splitPartSizesExt) const; }; -class TSplittedColumnChunk: public IPortionColumnChunk { -private: - using TBase = IPortionColumnChunk; - TSaverSplittedChunk Data; - ISchemaDetailInfo::TPtr SchemaInfo; -protected: - virtual std::vector> DoInternalSplitImpl(const TColumnSaver& saver, const std::shared_ptr& counters, const std::vector& splitSizes) const override; - virtual const TString& DoGetData() const override { - return Data.GetSerializedChunk(); - } - virtual ui64 DoGetRawBytesImpl() const override; - - virtual ui32 DoGetRecordsCountImpl() const override { - return Data.GetRecordsCount(); - } - - virtual TString DoDebugString() const override; - - virtual TSimpleChunkMeta DoBuildSimpleChunkMeta() const override { - return TSimpleChunkMeta(Data.GetColumn(), SchemaInfo->NeedMinMaxForColumn(GetColumnId()), SchemaInfo->IsSortedColumn(GetColumnId())); - } - - virtual std::shared_ptr DoGetFirstScalar() const override { - return Data.GetFirstScalar(); - } - virtual std::shared_ptr DoGetLastScalar() const override { - return Data.GetLastScalar(); - } - virtual std::shared_ptr DoCopyWithAnotherBlob(TString&& /*data*/, const TSimpleColumnInfo& /*columnInfo*/) const override { - AFL_VERIFY(false); - return nullptr; - } - -public: - i64 GetSize() const { - return Data.GetSerializedChunk().size(); - } - - const TSaverSplittedChunk& GetData() const { - return Data; - } - - TSplittedColumnChunk() = default; - - TSplittedColumnChunk(const ui32 columnId, const TSaverSplittedChunk& data, ISchemaDetailInfo::TPtr schemaInfo) - : TBase(columnId) - , Data(data) - , SchemaInfo(schemaInfo) { - - } -}; - } diff --git a/ydb/core/tx/columnshard/splitter/stats.cpp b/ydb/core/formats/arrow/splitter/stats.cpp similarity index 95% rename from ydb/core/tx/columnshard/splitter/stats.cpp rename to ydb/core/formats/arrow/splitter/stats.cpp index 3cb468bec5ad..c815485ada8e 100644 --- a/ydb/core/tx/columnshard/splitter/stats.cpp +++ b/ydb/core/formats/arrow/splitter/stats.cpp @@ -1,7 +1,7 @@ #include "stats.h" #include -namespace NKikimr::NOlap { +namespace NKikimr::NArrow::NSplitter { std::optional TSerializationStats::GetStatsForRecordBatch(const std::shared_ptr& schema) const { std::optional result; diff --git a/ydb/core/tx/columnshard/splitter/stats.h b/ydb/core/formats/arrow/splitter/stats.h similarity index 99% rename from ydb/core/tx/columnshard/splitter/stats.h rename to ydb/core/formats/arrow/splitter/stats.h index 695a7ab32dd7..447e59b68ca0 100644 --- a/ydb/core/tx/columnshard/splitter/stats.h +++ b/ydb/core/formats/arrow/splitter/stats.h @@ -11,7 +11,7 @@ #include #include -namespace NKikimr::NOlap { +namespace NKikimr::NArrow::NSplitter { class TSimpleSerializationStat { protected: diff --git a/ydb/core/formats/arrow/splitter/ya.make b/ydb/core/formats/arrow/splitter/ya.make new file mode 100644 index 000000000000..078d7ea83737 --- /dev/null +++ b/ydb/core/formats/arrow/splitter/ya.make @@ -0,0 +1,17 @@ +LIBRARY() + +SRCS( + stats.cpp + simple.cpp + scheme_info.cpp + similar_packer.cpp +) + +PEERDIR( + contrib/libs/apache/arrow + ydb/library/actors/core + ydb/library/conclusion + ydb/core/formats/arrow/serializer +) + +END() diff --git a/ydb/core/formats/arrow/common/validation.cpp b/ydb/core/formats/arrow/validation/validation.cpp similarity index 100% rename from ydb/core/formats/arrow/common/validation.cpp rename to ydb/core/formats/arrow/validation/validation.cpp diff --git a/ydb/core/formats/arrow/validation/validation.h b/ydb/core/formats/arrow/validation/validation.h new file mode 100644 index 000000000000..f71f18ece59c --- /dev/null +++ b/ydb/core/formats/arrow/validation/validation.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include +#include + +namespace NKikimr::NArrow { + +class TStatusValidator { +public: + static void Validate(const arrow::Status& status); + + template + static T GetValid(const arrow::Result& result) { + Validate(result.status()); + return *result; + } + + template + static T GetValid(arrow::Result&& result) { + Validate(result.status()); + return std::move(*result); + } +}; + +} diff --git a/ydb/core/formats/arrow/validation/ya.make b/ydb/core/formats/arrow/validation/ya.make new file mode 100644 index 000000000000..e060fae10d8e --- /dev/null +++ b/ydb/core/formats/arrow/validation/ya.make @@ -0,0 +1,12 @@ +LIBRARY() + +PEERDIR( + contrib/libs/apache/arrow + ydb/library/actors/core +) + +SRCS( + validation.cpp +) + +END() diff --git a/ydb/core/formats/arrow/ya.make b/ydb/core/formats/arrow/ya.make index 49938a884154..8bb86947266e 100644 --- a/ydb/core/formats/arrow/ya.make +++ b/ydb/core/formats/arrow/ya.make @@ -7,11 +7,14 @@ LIBRARY() PEERDIR( contrib/libs/apache/arrow ydb/core/scheme + ydb/core/formats/arrow/accessor ydb/core/formats/arrow/serializer ydb/core/formats/arrow/simple_builder ydb/core/formats/arrow/dictionary ydb/core/formats/arrow/transformer ydb/core/formats/arrow/reader + ydb/core/formats/arrow/save_load + ydb/core/formats/arrow/splitter ydb/core/formats/arrow/modifier ydb/core/formats/arrow/scalar ydb/core/formats/arrow/hash diff --git a/ydb/core/kqp/gateway/behaviour/tablestore/operations/alter_column.cpp b/ydb/core/kqp/gateway/behaviour/tablestore/operations/alter_column.cpp index 1467a61bc38e..b000a2fd94a4 100644 --- a/ydb/core/kqp/gateway/behaviour/tablestore/operations/alter_column.cpp +++ b/ydb/core/kqp/gateway/behaviour/tablestore/operations/alter_column.cpp @@ -16,15 +16,21 @@ TConclusionStatus TAlterColumnOperation::DoDeserialize(NYql::TObjectSettingsImpl if (StorageId && !*StorageId) { return TConclusionStatus::Fail("STORAGE_ID cannot be empty string"); } + { + auto status = AccessorConstructor.DeserializeFromRequest(features); + if (status.IsFail()) { + return status; + } + } { auto result = DictionaryEncodingDiff.DeserializeFromRequestFeatures(features); - if (!result) { - return TConclusionStatus::Fail(result.GetErrorMessage()); + if (result.IsFail()) { + return result; } } { auto status = Serializer.DeserializeFromRequest(features); - if (!status) { + if (status.IsFail()) { return status; } } @@ -40,6 +46,9 @@ void TAlterColumnOperation::DoSerializeScheme(NKikimrSchemeOp::TAlterColumnTable if (!!Serializer) { Serializer.SerializeToProto(*column->MutableSerializer()); } + if (!!AccessorConstructor) { + *column->MutableDataAccessorConstructor() = AccessorConstructor.SerializeToProto(); + } *column->MutableDictionaryEncoding() = DictionaryEncodingDiff.SerializeToProto(); if (DefaultValue) { column->SetDefaultValue(*DefaultValue); diff --git a/ydb/core/kqp/gateway/behaviour/tablestore/operations/alter_column.h b/ydb/core/kqp/gateway/behaviour/tablestore/operations/alter_column.h index ee51b47bb8df..23d1aef28abb 100644 --- a/ydb/core/kqp/gateway/behaviour/tablestore/operations/alter_column.h +++ b/ydb/core/kqp/gateway/behaviour/tablestore/operations/alter_column.h @@ -1,4 +1,5 @@ #include "abstract.h" +#include #include #include @@ -18,6 +19,7 @@ class TAlterColumnOperation : public ITableStoreOperation { NArrow::NSerialization::TSerializerContainer Serializer; NArrow::NDictionary::TEncodingDiff DictionaryEncodingDiff; std::optional DefaultValue; + NArrow::NAccessor::TRequestedConstructorContainer AccessorConstructor; public: TConclusionStatus DoDeserialize(NYql::TObjectSettingsImpl::TFeaturesExtractor& features) override; diff --git a/ydb/core/kqp/runtime/kqp_scan_data.cpp b/ydb/core/kqp/runtime/kqp_scan_data.cpp index 2e8b430681e9..2fe0ec750c0f 100644 --- a/ydb/core/kqp/runtime/kqp_scan_data.cpp +++ b/ydb/core/kqp/runtime/kqp_scan_data.cpp @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include diff --git a/ydb/core/kqp/ut/olap/sparsed_ut.cpp b/ydb/core/kqp/ut/olap/sparsed_ut.cpp new file mode 100644 index 000000000000..d3fd08ed17ab --- /dev/null +++ b/ydb/core/kqp/ut/olap/sparsed_ut.cpp @@ -0,0 +1,78 @@ +#include "helpers/local.h" +#include "helpers/writer.h" +#include "helpers/typed_local.h" +#include "helpers/query_executor.h" +#include "helpers/get_value.h" + +#include +#include +#include +#include + +namespace NKikimr::NKqp { + +Y_UNIT_TEST_SUITE(KqpOlapSparsed) { + + Y_UNIT_TEST(DefaultValues) { + auto settings = TKikimrSettings().SetWithSampleTables(false); + TKikimrRunner kikimr(settings); + auto csController = NKikimr::NYDBTest::TControllers::RegisterCSControllerGuard(); + csController->SetPeriodicWakeupActivationPeriod(TDuration::MilliSeconds(100)); + Tests::NCommon::TLoggerInit(kikimr).Initialize(); + TTypedLocalHelper helper("Utf8", kikimr); + helper.CreateTestOlapTable(); + const auto addData = [&](const double shiftKff, const ui32 expectation, const ui32 expectationCount) { + const double frq = 0.9; + { + NArrow::NConstruction::TStringPoolFiller sPool(1000, 52, "abcde", frq); + helper.FillTable(sPool, shiftKff, 1000); + } + csController->WaitIndexation(TDuration::Seconds(5)); + csController->WaitCompactions(TDuration::Seconds(5)); + ui32 defCount = 0; + { + auto selectQuery = TString(R"( + SELECT + count(*) as count, + FROM `/Root/olapStore/olapTable` + WHERE field = 'abcde' + )"); + + auto tableClient = kikimr.GetTableClient(); + auto rows = ExecuteScanQuery(tableClient, selectQuery); + defCount = GetUint64(rows[0].at("count")); + } +// AFL_VERIFY(GetUint64(rows[0].at("count")) < expectation + delta)("count", GetUint64(rows[0].at("count")))("expect", expectation); +// AFL_VERIFY(expectation - delta <= GetUint64(rows[0].at("count")))("count", GetUint64(rows[0].at("count")))("expect", expectation); + { + auto selectQuery = TString(R"( + SELECT + count(*) as count, + FROM `/Root/olapStore/olapTable` + )"); + + auto tableClient = kikimr.GetTableClient(); + auto rows = ExecuteScanQuery(tableClient, selectQuery); + AFL_VERIFY(expectationCount == GetUint64(rows[0].at("count"))); + } + AFL_VERIFY(std::abs(1.0 * defCount / expectationCount - frq) < 0.5 * frq); + AFL_VERIFY(1.0 * defCount / expectationCount > 0.3 * frq); + }; + addData(0, 900, 1000); + + helper.ExecuteSchemeQuery("ALTER OBJECT `/Root/olapStore` (TYPE TABLESTORE) SET (ACTION=ALTER_COLUMN, NAME=field, `DATA_ACCESSOR_CONSTRUCTOR.CLASS_NAME`=`SPARSED`, `DEFAULT_VALUE`=`abcde`);"); + addData(0.1, 900, 1100); + + helper.ExecuteSchemeQuery("ALTER OBJECT `/Root/olapStore` (TYPE TABLESTORE) SET (ACTION=ALTER_COLUMN, NAME=field, `DATA_ACCESSOR_CONSTRUCTOR.CLASS_NAME`=`PLAIN`);"); + addData(0.2, 900, 1200); + + helper.ExecuteSchemeQuery("ALTER OBJECT `/Root/olapStore` (TYPE TABLESTORE) SET (ACTION=ALTER_COLUMN, NAME=field, `DATA_ACCESSOR_CONSTRUCTOR.CLASS_NAME`=`SPARSED`);"); + addData(0.3, 900, 1300); + + helper.ExecuteSchemeQuery("ALTER OBJECT `/Root/olapStore` (TYPE TABLESTORE) SET (ACTION=ALTER_COLUMN, NAME=field, `DATA_ACCESSOR_CONSTRUCTOR.CLASS_NAME`=`PLAIN`);"); + addData(0.4, 900, 1400); + } + +} + +} // namespace diff --git a/ydb/core/kqp/ut/olap/ya.make b/ydb/core/kqp/ut/olap/ya.make index d9b7a06ef77b..06deb96569b9 100644 --- a/ydb/core/kqp/ut/olap/ya.make +++ b/ydb/core/kqp/ut/olap/ya.make @@ -23,6 +23,7 @@ SRCS( clickbench_ut.cpp aggregations_ut.cpp write_ut.cpp + sparsed_ut.cpp ) PEERDIR( diff --git a/ydb/core/protos/flat_scheme_op.proto b/ydb/core/protos/flat_scheme_op.proto index b0372ba5d1c9..e2b59b83444a 100644 --- a/ydb/core/protos/flat_scheme_op.proto +++ b/ydb/core/protos/flat_scheme_op.proto @@ -20,6 +20,7 @@ import "ydb/library/mkql_proto/protos/minikql.proto"; import "ydb/core/protos/index_builder.proto"; import "ydb/core/tx/columnshard/engines/scheme/defaults/protos/data.proto"; import "ydb/core/tx/columnshard/common/protos/snapshot.proto"; +import "ydb/core/formats/arrow/protos/accessor.proto"; import "google/protobuf/empty.proto"; import "google/protobuf/struct.proto"; @@ -426,6 +427,7 @@ message TOlapColumnDiff { optional TOlapColumn.TSerializer Serializer = 5; optional string StorageId = 6; optional string DefaultValue = 7; + optional NKikimrArrowAccessorProto.TRequestedConstructor DataAccessorConstructor = 8; } message TOlapColumnDescription { @@ -445,6 +447,7 @@ message TOlapColumnDescription { optional TOlapColumn.TSerializer Serializer = 10; optional string StorageId = 11; optional NKikimrColumnShardColumnDefaults.TColumnDefault DefaultValue = 12; + optional NKikimrArrowAccessorProto.TConstructor DataAccessorConstructor = 13; } message TRequestedBloomFilter { diff --git a/ydb/core/testlib/cs_helper.cpp b/ydb/core/testlib/cs_helper.cpp index c6f05ec8c86f..3d20b83b217c 100644 --- a/ydb/core/testlib/cs_helper.cpp +++ b/ydb/core/testlib/cs_helper.cpp @@ -183,7 +183,7 @@ std::shared_ptr THelper::TestArrowBatch(ui64 pathIdBegin, ui TString THelper::GetTestTableSchema() const { TStringBuilder sb; sb << R"(Columns{ Name: "timestamp" Type : "Timestamp" NotNull : true })"; - sb << R"(Columns{ Name: "resource_id" Type : "Utf8" })"; + sb << R"(Columns{ Name: "resource_id" Type : "Utf8" DataAccessorConstructor{ ClassName: "SPARSED" } })"; sb << "Columns{ Name: \"uid\" Type : \"Utf8\" NotNull : true StorageId : \"" + OptionalStorageId + "\" }"; sb << R"(Columns{ Name: "level" Type : "Int32" })"; sb << "Columns{ Name: \"message\" Type : \"Utf8\" StorageId : \"" + OptionalStorageId + "\" }"; diff --git a/ydb/core/testlib/cs_helper.h b/ydb/core/testlib/cs_helper.h index 7a9e3dad1bf3..e8e87ade3ad2 100644 --- a/ydb/core/testlib/cs_helper.h +++ b/ydb/core/testlib/cs_helper.h @@ -43,9 +43,9 @@ class THelper: public THelperSchemaless { static constexpr const char * PROTO_SCHEMA = R"( Columns { Name: "timestamp" Type: "Timestamp" NotNull: true } - Columns { Name: "resource_id" Type: "Utf8" } + Columns { Name: "resource_id" Type: "Utf8" DataAccessorConstructor{ ClassName: "SPARSED" }} Columns { Name: "uid" Type: "Utf8" } - Columns { Name: "level" Type: "Int32" } + Columns { Name: "level" Type: "Int32" DataAccessorConstructor{ ClassName: "SPARSED" }} Columns { Name: "message" Type: "Utf8" } KeyColumnNames: "timestamp" Engine: COLUMN_ENGINE_REPLACING_TIMESERIES diff --git a/ydb/core/tx/columnshard/engines/changes/compaction/abstract/merger.h b/ydb/core/tx/columnshard/engines/changes/compaction/abstract/merger.h index 1e93f0c70979..f66a2c17277e 100644 --- a/ydb/core/tx/columnshard/engines/changes/compaction/abstract/merger.h +++ b/ydb/core/tx/columnshard/engines/changes/compaction/abstract/merger.h @@ -4,13 +4,18 @@ namespace NKikimr::NOlap::NCompaction { class IColumnMerger { +public: + using TFactory = NObjectFactory::TParametrizedObjectFactory; private: bool Started = false; virtual std::vector DoExecute( - const NCompaction::TColumnMergeContext& context, const arrow::UInt16Array& pIdxArray, const arrow::UInt32Array& pRecordIdxArray) = 0; + const TChunkMergeContext& context, const arrow::UInt16Array& pIdxArray, const arrow::UInt32Array& pRecordIdxArray) = 0; virtual void DoStart(const std::vector>& input) = 0; +protected: + const TColumnMergeContext& Context; + public: static inline const TString PortionIdFieldName = "$$__portion_id"; static inline const TString PortionRecordIndexFieldName = "$$__portion_record_idx"; @@ -19,6 +24,11 @@ class IColumnMerger { static inline const std::shared_ptr PortionRecordIndexField = std::make_shared(PortionRecordIndexFieldName, std::make_shared()); + IColumnMerger(const TColumnMergeContext& context) + : Context(context) + { + + } virtual ~IColumnMerger() = default; void Start(const std::vector>& input) { @@ -27,8 +37,7 @@ class IColumnMerger { return DoStart(input); } - std::vector Execute( - const NCompaction::TColumnMergeContext& context, const std::shared_ptr& remap) { + std::vector Execute(const TChunkMergeContext& context, const std::shared_ptr& remap) { auto columnPortionIdx = remap->GetColumnByName(IColumnMerger::PortionIdFieldName); auto columnPortionRecordIdx = remap->GetColumnByName(IColumnMerger::PortionRecordIndexFieldName); diff --git a/ydb/core/tx/columnshard/engines/changes/compaction/common/context.h b/ydb/core/tx/columnshard/engines/changes/compaction/common/context.h index 80356224909f..f5bf08b4fc14 100644 --- a/ydb/core/tx/columnshard/engines/changes/compaction/common/context.h +++ b/ydb/core/tx/columnshard/engines/changes/compaction/common/context.h @@ -1,8 +1,8 @@ #pragma once +#include #include #include #include -#include #include @@ -12,25 +12,29 @@ class TColumnMergeContext { private: YDB_READONLY(ui32, ColumnId, 0); ISnapshotSchema::TPtr SchemaInfo; - YDB_READONLY_DEF(TColumnSaver, Saver); + YDB_ACCESSOR_DEF(TColumnSaver, Saver); YDB_READONLY_DEF(std::shared_ptr, Loader); YDB_READONLY_DEF(std::shared_ptr, ResultField); - YDB_READONLY(ui32, PortionRowsCountLimit, 10000); YDB_READONLY(ui64, ChunkPackedBytesLimit, 7 * 1024 * 1024); YDB_READONLY(ui64, ExpectedBlobPackedBytes, 4 * 1024 * 1024); YDB_READONLY(ui64, ChunkRawBytesLimit, 50 * 1024 * 1024); YDB_READONLY(ui64, StorePackedChunkSizeLimit, 512 * 1024); YDB_READONLY(bool, UseWholeChunksOptimization, true); - std::optional ColumnStat; + std::optional ColumnStat; const TIndexInfo& IndexInfo; + public: + std::shared_ptr GetDefaultValue() const { + return Loader->GetDefaultValue(); + } + ISnapshotSchema::TPtr GetSchemaInfo() const { return SchemaInfo; } - const std::optional& GetColumnStat() const { + const std::optional& GetColumnStat() const { return ColumnStat; } @@ -42,25 +46,29 @@ class TColumnMergeContext { return IndexInfo; } - TColumnMergeContext(const ui32 columnId, const ISnapshotSchema::TPtr& schema, const ui32 portionRowsCountLimit, - const ui32 chunkRawBytesLimit, const std::optional& columnStat, - const NArrow::NSerialization::TSerializerContainer& overrideSerializer) + TColumnMergeContext(const ui32 columnId, const ISnapshotSchema::TPtr& schema, const ui32 chunkRawBytesLimit, + const std::optional& columnStat) : ColumnId(columnId) , SchemaInfo(schema) , Saver(schema->GetColumnSaver(columnId)) , Loader(schema->GetColumnLoaderOptional(columnId)) , ResultField(schema->GetIndexInfo().GetColumnFieldVerified(columnId)) - , PortionRowsCountLimit(portionRowsCountLimit) , ChunkRawBytesLimit(chunkRawBytesLimit) , UseWholeChunksOptimization(!schema->GetIndexInfo().GetReplaceKey()->GetFieldByName(ResultField->name())) , ColumnStat(columnStat) , IndexInfo(schema->GetIndexInfo()) { - Y_ABORT_UNLESS(PortionRowsCountLimit); Y_ABORT_UNLESS(ChunkRawBytesLimit); - if (!!overrideSerializer) { - Saver.ResetSerializer(overrideSerializer); - } } }; -} +class TChunkMergeContext { +private: + YDB_READONLY(ui32, PortionRowsCountLimit, 10000); + +public: + TChunkMergeContext(const ui32 portionRowsCountLimit) + : PortionRowsCountLimit(portionRowsCountLimit) { + Y_ABORT_UNLESS(PortionRowsCountLimit); + } +}; +} // namespace NKikimr::NOlap::NCompaction diff --git a/ydb/core/tx/columnshard/engines/changes/compaction/merger.cpp b/ydb/core/tx/columnshard/engines/changes/compaction/merger.cpp index 856c4107da8b..d2c79c4a7bbe 100644 --- a/ydb/core/tx/columnshard/engines/changes/compaction/merger.cpp +++ b/ydb/core/tx/columnshard/engines/changes/compaction/merger.cpp @@ -2,6 +2,7 @@ #include "abstract/merger.h" #include "plain/logic.h" +#include "sparsed/logic.h" #include #include @@ -11,7 +12,7 @@ namespace NKikimr::NOlap::NCompaction { -std::vector TMerger::Execute(const std::shared_ptr& stats, +std::vector TMerger::Execute(const std::shared_ptr& stats, const NArrow::NMerger::TIntervalPositions& checkPoints, const std::shared_ptr& resultFiltered, const ui64 pathId, const std::optional shardingActualVersion) { AFL_VERIFY(Batches.size() == Filters.size()); @@ -55,7 +56,20 @@ std::vector TMerger::Execute(c NActors::TLogContextBuilder::Build()("field_name", resultFiltered->GetIndexInfo().GetColumnName(columnId))); auto columnInfo = stats->GetColumnInfo(columnId); auto resultField = resultFiltered->GetIndexInfo().GetColumnFieldVerified(columnId); - std::shared_ptr merger = std::make_shared(); + + TColumnMergeContext commonContext( + columnId, resultFiltered, NSplitter::TSplitSettings().GetExpectedUnpackColumnChunkRawSize(), columnInfo); + if (OptimizationWritingPackMode) { + commonContext.MutableSaver().AddSerializerWithBorder( + 100, std::make_shared(arrow::Compression::type::UNCOMPRESSED)); + commonContext.MutableSaver().AddSerializerWithBorder( + Max(), std::make_shared(arrow::Compression::type::LZ4_FRAME)); + } + + THolder merger = + IColumnMerger::TFactory::MakeHolder(commonContext.GetLoader()->GetAccessorConstructor().GetClassName(), commonContext); + AFL_VERIFY(!!merger)("problem", "cannot create merger")( + "class_name", commonContext.GetLoader()->GetAccessorConstructor().GetClassName()); // resultFiltered->BuildColumnMergerVerified(columnId); { @@ -73,19 +87,7 @@ std::vector TMerger::Execute(c const ui32 portionRecordsCountLimit = batchResult->num_rows() / (batchResult->num_rows() / NSplitter::TSplitSettings().GetExpectedRecordsCountOnPage() + 1) + 1; - NArrow::NSerialization::TSerializerContainer externalSaver; - if (OptimizationWritingPackMode) { - if (batchResult->num_rows() < 100) { - externalSaver = NArrow::NSerialization::TSerializerContainer( - std::make_shared(arrow::Compression::type::UNCOMPRESSED)); - } else { - externalSaver = NArrow::NSerialization::TSerializerContainer( - std::make_shared(arrow::Compression::type::LZ4_FRAME)); - } - } - - NCompaction::TColumnMergeContext context(columnId, resultFiltered, portionRecordsCountLimit, - NSplitter::TSplitSettings().GetExpectedUnpackColumnChunkRawSize(), columnInfo, externalSaver); + TChunkMergeContext context(portionRecordsCountLimit); chunkGroups[batchIdx][columnId] = merger->Execute(context, batchResult); ++batchIdx; @@ -128,7 +130,7 @@ std::vector TMerger::Execute(c } batchSlices.emplace_back(portionColumns, schemaDetails, Context.Counters.SplitterCounters); } - TSimilarPacker slicer(NSplitter::TSplitSettings().GetExpectedPortionSize()); + NArrow::NSplitter::TSimilarPacker slicer(NSplitter::TSplitSettings().GetExpectedPortionSize()); auto packs = slicer.Split(batchSlices); ui32 recordIdx = 0; diff --git a/ydb/core/tx/columnshard/engines/changes/compaction/merger.h b/ydb/core/tx/columnshard/engines/changes/compaction/merger.h index be9beae47584..ed862f2e25dd 100644 --- a/ydb/core/tx/columnshard/engines/changes/compaction/merger.h +++ b/ydb/core/tx/columnshard/engines/changes/compaction/merger.h @@ -1,8 +1,8 @@ #pragma once -#include #include +#include #include -#include +#include #include #include #include @@ -25,14 +25,11 @@ class TMerger { TMerger(const TConstructionContext& context, const TSaverContext& saverContext) : Context(context) - , SaverContext(saverContext) - { - + , SaverContext(saverContext) { } TMerger(const TConstructionContext& context, const TSaverContext& saverContext, - std::vector>&& batches, - std::vector>&& filters) + std::vector>&& batches, std::vector>&& filters) : Batches(std::move(batches)) , Filters(std::move(filters)) , Context(context) @@ -40,9 +37,8 @@ class TMerger { AFL_VERIFY(Batches.size() == Filters.size()); } - std::vector Execute( - const std::shared_ptr& stats, - const NArrow::NMerger::TIntervalPositions& checkPoints, - const std::shared_ptr& resultFiltered, const ui64 pathId, const std::optional shardingActualVersion); + std::vector Execute(const std::shared_ptr& stats, + const NArrow::NMerger::TIntervalPositions& checkPoints, const std::shared_ptr& resultFiltered, + const ui64 pathId, const std::optional shardingActualVersion); }; -} +} // namespace NKikimr::NOlap::NCompaction diff --git a/ydb/core/tx/columnshard/engines/changes/compaction/plain/column_portion_chunk.cpp b/ydb/core/tx/columnshard/engines/changes/compaction/plain/column_portion_chunk.cpp index 1cd921676f01..dde08cabb4fc 100644 --- a/ydb/core/tx/columnshard/engines/changes/compaction/plain/column_portion_chunk.cpp +++ b/ydb/core/tx/columnshard/engines/changes/compaction/plain/column_portion_chunk.cpp @@ -1,52 +1,27 @@ #include "column_portion_chunk.h" + +#include #include -#include #include #include namespace NKikimr::NOlap::NCompaction { -std::shared_ptr TColumnPortion::AppendBlob(const TString& data, const TColumnRecord& columnChunk, ui32& remained) { -// if (CurrentPortionRecords + columnChunk.GetMeta().GetNumRows() <= Context.GetPortionRowsCountLimit() && -// columnChunk.GetMeta().GetRawBytes() < Context.GetChunkRawBytesLimit() && -// data.size() < Context.GetChunkPackedBytesLimit() && -// columnChunk.GetMeta().GetRawBytes() > Context.GetStorePackedChunkSizeLimit() && Context.GetSaver().IsHardPacker() && -// Context.GetUseWholeChunksOptimization()) -// { -// NChanges::TGeneralCompactionCounters::OnFullBlobAppend(columnChunk.BlobRange.GetBlobSize()); -// FlushBuffer(); -// Chunks.emplace_back(std::make_shared(data, columnChunk, Context.GetSchemaInfo())); -// PackedSize += Chunks.back()->GetPackedSize(); -// CurrentPortionRecords += columnChunk.GetMeta().GetNumRows(); -// return nullptr; -// } else { - NChanges::TGeneralCompactionCounters::OnSplittedBlobAppend(columnChunk.BlobRange.GetSize()); - auto batch = NArrow::TStatusValidator::GetValid(Context.GetLoader()->Apply(data)); - AFL_VERIFY(batch->num_columns() == 1); - auto batchArray = batch->column(0); - remained = AppendSlice(batchArray, 0, batch->num_rows()); - if (remained) { - return batchArray; - } else { - return nullptr; - } -// } -} - ui32 TColumnPortion::AppendSlice(const std::shared_ptr& a, const ui32 startIndex, const ui32 length) { Y_ABORT_UNLESS(a); Y_ABORT_UNLESS(length); - Y_ABORT_UNLESS(CurrentPortionRecords < Context.GetPortionRowsCountLimit()); + Y_ABORT_UNLESS(CurrentPortionRecords < ChunkContext.GetPortionRowsCountLimit()); Y_ABORT_UNLESS(startIndex + length <= a->length()); AFL_VERIFY(Type->id() == a->type_id())("own", Type->ToString())("a", a->type()->ToString()); ui32 i = startIndex; const ui32 packedRecordSize = Context.GetColumnStat() ? Context.GetColumnStat()->GetPackedRecordSize() : 0; for (; i < startIndex + length; ++i) { ui64 recordSize = 0; - AFL_VERIFY(NArrow::Append(*Builder, *a, i, &recordSize))("a", a->ToString())("a_type", a->type()->ToString())("builder_type", Builder->type()->ToString()); + AFL_VERIFY(NArrow::Append(*Builder, *a, i, &recordSize))("a", a->ToString())("a_type", a->type()->ToString())( + "builder_type", Builder->type()->ToString()); CurrentChunkRawSize += recordSize; PredictedPackedBytes += packedRecordSize ? packedRecordSize : (recordSize / 2); - if (++CurrentPortionRecords == Context.GetPortionRowsCountLimit()) { + if (++CurrentPortionRecords == ChunkContext.GetPortionRowsCountLimit()) { FlushBuffer(); ++i; break; @@ -59,17 +34,17 @@ ui32 TColumnPortion::AppendSlice(const std::shared_ptr& a, const u } bool TColumnPortion::FlushBuffer() { - if (Builder->length()) { - auto newArrayChunk = NArrow::TStatusValidator::GetValid(Builder->Finish()); - Chunks.emplace_back(std::make_shared(Context.GetSaver().Apply(newArrayChunk, Context.GetResultField()), newArrayChunk, TChunkAddress(Context.GetColumnId(), 0), ColumnInfo)); - Builder = Context.MakeBuilder(); - CurrentChunkRawSize = 0; - PredictedPackedBytes = 0; - PackedSize += Chunks.back()->GetPackedSize(); - return true; - } else { + if (!Builder->length()) { return false; } + auto newArrayChunk = NArrow::TStatusValidator::GetValid(Builder->Finish()); + Chunks.emplace_back(std::make_shared(Context.GetSaver().Apply(newArrayChunk, Context.GetResultField()), + std::make_shared(newArrayChunk), TChunkAddress(Context.GetColumnId(), 0), ColumnInfo)); + Builder = Context.MakeBuilder(); + CurrentChunkRawSize = 0; + PredictedPackedBytes = 0; + PackedSize += Chunks.back()->GetPackedSize(); + return true; } -} +} // namespace NKikimr::NOlap::NCompaction diff --git a/ydb/core/tx/columnshard/engines/changes/compaction/plain/column_portion_chunk.h b/ydb/core/tx/columnshard/engines/changes/compaction/plain/column_portion_chunk.h index 98fe703f7e1a..ce10642ae95d 100644 --- a/ydb/core/tx/columnshard/engines/changes/compaction/plain/column_portion_chunk.h +++ b/ydb/core/tx/columnshard/engines/changes/compaction/plain/column_portion_chunk.h @@ -17,6 +17,7 @@ class TColumnPortion: public TColumnPortionResult { std::unique_ptr Builder; std::shared_ptr Type; const TColumnMergeContext& Context; + const TChunkMergeContext& ChunkContext; YDB_READONLY(ui64, CurrentChunkRawSize, 0); double PredictedPackedBytes = 0; const TSimpleColumnInfo ColumnInfo; @@ -24,22 +25,22 @@ class TColumnPortion: public TColumnPortionResult { ui64 CurrentPortionRecords = 0; public: - TColumnPortion(const TColumnMergeContext& context) + TColumnPortion(const TColumnMergeContext& context, const TChunkMergeContext& chunkContext) : TBase(context.GetColumnId()) , Context(context) + , ChunkContext(chunkContext) , ColumnInfo(Context.GetIndexInfo().GetColumnFeaturesVerified(context.GetColumnId())) { Builder = Context.MakeBuilder(); Type = Builder->type(); } bool IsFullPortion() const { - Y_ABORT_UNLESS(CurrentPortionRecords <= Context.GetPortionRowsCountLimit()); - return CurrentPortionRecords == Context.GetPortionRowsCountLimit(); + Y_ABORT_UNLESS(CurrentPortionRecords <= ChunkContext.GetPortionRowsCountLimit()); + return CurrentPortionRecords == ChunkContext.GetPortionRowsCountLimit(); } bool FlushBuffer(); - std::shared_ptr AppendBlob(const TString& data, const TColumnRecord& columnChunk, ui32& remained); ui32 AppendSlice(const std::shared_ptr& a, const ui32 startIndex, const ui32 length); }; diff --git a/ydb/core/tx/columnshard/engines/changes/compaction/plain/logic.cpp b/ydb/core/tx/columnshard/engines/changes/compaction/plain/logic.cpp index ac8cb351c572..328a9828a495 100644 --- a/ydb/core/tx/columnshard/engines/changes/compaction/plain/logic.cpp +++ b/ydb/core/tx/columnshard/engines/changes/compaction/plain/logic.cpp @@ -8,9 +8,9 @@ void TPlainMerger::DoStart(const std::vector TPlainMerger::DoExecute( - const NCompaction::TColumnMergeContext& context, const arrow::UInt16Array& pIdxArray, const arrow::UInt32Array& pRecordIdxArray) { - NCompaction::TMergedColumn mColumn(context); +std::vector TPlainMerger::DoExecute(const TChunkMergeContext& chunkContext, + const arrow::UInt16Array& pIdxArray, const arrow::UInt32Array& pRecordIdxArray) { + NCompaction::TMergedColumn mColumn(Context, chunkContext); std::optional predPortionIdx; for (ui32 idx = 0; idx < pIdxArray.length(); ++idx) { diff --git a/ydb/core/tx/columnshard/engines/changes/compaction/plain/logic.h b/ydb/core/tx/columnshard/engines/changes/compaction/plain/logic.h index 995cd1c33a72..9fce12856d54 100644 --- a/ydb/core/tx/columnshard/engines/changes/compaction/plain/logic.h +++ b/ydb/core/tx/columnshard/engines/changes/compaction/plain/logic.h @@ -1,19 +1,23 @@ #pragma once #include "column_cursor.h" -#include +#include +#include #include namespace NKikimr::NOlap::NCompaction { class TPlainMerger: public IColumnMerger { private: + static inline auto Registrator = TFactory::TRegistrator(NArrow::NAccessor::TGlobalConst::PlainDataAccessorName); + using TBase = IColumnMerger; std::vector Cursors; virtual void DoStart(const std::vector>& input) override; - virtual std::vector DoExecute(const NCompaction::TColumnMergeContext& context, const arrow::UInt16Array& pIdxArray, - const arrow::UInt32Array& pRecordIdxArray) override; + virtual std::vector DoExecute( + const TChunkMergeContext& context, const arrow::UInt16Array& pIdxArray, const arrow::UInt32Array& pRecordIdxArray) override; public: + using TBase::TBase; }; } // namespace NKikimr::NOlap::NCompaction diff --git a/ydb/core/tx/columnshard/engines/changes/compaction/plain/merged_column.cpp b/ydb/core/tx/columnshard/engines/changes/compaction/plain/merged_column.cpp index 5f638a30f155..84dd8608ffc4 100644 --- a/ydb/core/tx/columnshard/engines/changes/compaction/plain/merged_column.cpp +++ b/ydb/core/tx/columnshard/engines/changes/compaction/plain/merged_column.cpp @@ -2,20 +2,6 @@ namespace NKikimr::NOlap::NCompaction { -void TMergedColumn::AppendBlob(const TString& data, const TColumnRecord& columnChunk) { - RecordsCount += columnChunk.GetMeta().GetNumRows(); - ui32 remained; - std::shared_ptr dataArray = Portions.back().AppendBlob(data, columnChunk, remained); - while (remained) { - Y_ABORT_UNLESS(Portions.back().IsFullPortion()); - NewPortion(); - remained = Portions.back().AppendSlice(dataArray, dataArray->length() - remained, remained); - } - if (Portions.back().IsFullPortion()) { - NewPortion(); - } -} - void TMergedColumn::AppendSlice(const std::shared_ptr& data, const ui32 startIndex, const ui32 length) { RecordsCount += length; Y_ABORT_UNLESS(data); @@ -44,7 +30,7 @@ void TMergedColumn::NewPortion() { if (Portions.size()) { Portions.back().FlushBuffer(); } - Portions.emplace_back(TColumnPortion(Context)); + Portions.emplace_back(TColumnPortion(Context, ChunkContext)); } } diff --git a/ydb/core/tx/columnshard/engines/changes/compaction/plain/merged_column.h b/ydb/core/tx/columnshard/engines/changes/compaction/plain/merged_column.h index 9dee31b84215..2433bb8f4862 100644 --- a/ydb/core/tx/columnshard/engines/changes/compaction/plain/merged_column.h +++ b/ydb/core/tx/columnshard/engines/changes/compaction/plain/merged_column.h @@ -9,18 +9,20 @@ namespace NKikimr::NOlap::NCompaction { class TMergedColumn { private: TColumnMergeContext Context; + TChunkMergeContext ChunkContext; YDB_READONLY_DEF(std::vector, Portions); YDB_READONLY(ui32, RecordsCount, 0); void NewPortion(); public: - TMergedColumn(const TColumnMergeContext& context) - : Context(context) { + TMergedColumn(const TColumnMergeContext& context, const TChunkMergeContext& chunkContext) + : Context(context) + , ChunkContext(chunkContext) + { NewPortion(); } - void AppendBlob(const TString& data, const TColumnRecord& columnChunk); void AppendSlice(const std::shared_ptr& data, const ui32 startIndex, const ui32 length); std::vector BuildResult(); diff --git a/ydb/core/tx/columnshard/engines/changes/compaction/plain/ya.make b/ydb/core/tx/columnshard/engines/changes/compaction/plain/ya.make index 64de6caea075..91991ea51097 100644 --- a/ydb/core/tx/columnshard/engines/changes/compaction/plain/ya.make +++ b/ydb/core/tx/columnshard/engines/changes/compaction/plain/ya.make @@ -4,7 +4,7 @@ SRCS( column_cursor.cpp column_portion_chunk.cpp merged_column.cpp - logic.cpp + GLOBAL logic.cpp ) PEERDIR( diff --git a/ydb/core/tx/columnshard/engines/changes/compaction/sparsed/logic.cpp b/ydb/core/tx/columnshard/engines/changes/compaction/sparsed/logic.cpp new file mode 100644 index 000000000000..abf68862c705 --- /dev/null +++ b/ydb/core/tx/columnshard/engines/changes/compaction/sparsed/logic.cpp @@ -0,0 +1,125 @@ +#include "logic.h" +#include +#include + +namespace NKikimr::NOlap::NCompaction { + +void TSparsedMerger::DoStart(const std::vector>& input) { + for (auto&& p : input) { + Cursors.emplace_back(p, Context); + } +} + +std::vector TSparsedMerger::DoExecute( + const TChunkMergeContext& chunkContext, const arrow::UInt16Array& pIdxArray, const arrow::UInt32Array& pRecordIdxArray) { + std::vector result; + std::shared_ptr writer = std::make_shared(Context); + for (ui32 idx = 0; idx < pIdxArray.length(); ++idx) { + const ui16 portionIdx = pIdxArray.Value(idx); + const ui32 portionRecordIdx = pRecordIdxArray.Value(idx); + auto& cursor = Cursors[portionIdx]; + + cursor.AddIndexTo(portionRecordIdx, *writer); + if (writer->AddPosition() == chunkContext.GetPortionRowsCountLimit()) { + result.emplace_back(writer->Flush()); + writer = std::make_shared(Context); + } + } + if (writer->HasData()) { + result.emplace_back(writer->Flush()); + } + return result; +} + +void TSparsedMerger::TWriter::AddRealData(const std::shared_ptr& arr, const ui32 index) { + AFL_VERIFY(arr); + AFL_VERIFY(NArrow::Append(*ValueBuilder, *arr, index)); + NArrow::TStatusValidator::Validate(IndexBuilderImpl->Append(CurrentRecordIdx)); + ++UsefulRecordsCount; +} + +TColumnPortionResult TSparsedMerger::TWriter::Flush() { + std::vector> fields = { std::make_shared("index", arrow::uint32()), + std::make_shared("value", DataType) }; + auto schema = std::make_shared(fields); + std::vector> columns = { NArrow::TStatusValidator::GetValid(IndexBuilder->Finish()), + NArrow::TStatusValidator::GetValid(ValueBuilder->Finish()) }; + + auto recordBatch = arrow::RecordBatch::Make(schema, UsefulRecordsCount, columns); + NArrow::NAccessor::TSparsedArray::TBuilder builder( + Context.GetIndexInfo().GetColumnFeaturesVerified(Context.GetColumnId()).GetDefaultValue().GetValue(), Context.GetResultField()->type()); + builder.AddChunk(CurrentRecordIdx, recordBatch); + Chunks.emplace_back(std::make_shared(Context.GetSaver().Apply(recordBatch), builder.Finish(), + TChunkAddress(ColumnId, 0), Context.GetIndexInfo().GetColumnFeaturesVerified(ColumnId))); + return *this; +} + +TSparsedMerger::TWriter::TWriter(const TColumnMergeContext& context) + : TBase(context.GetColumnId()) + , DataType(context.GetResultField()->type()) + , Context(context) { + IndexBuilder = NArrow::MakeBuilder(arrow::uint32()); + ValueBuilder = NArrow::MakeBuilder(DataType); + IndexBuilderImpl = (arrow::UInt32Builder*)(IndexBuilder.get()); +} + +bool TSparsedMerger::TCursor::AddIndexTo(const ui32 index, TWriter& writer) { + if (index < NextGlobalPosition) { + return false; + } else if (index == NextGlobalPosition) { + if (index == CommonShift + Chunk->GetRecordsCount()) { + InitArrays(index); + if (index != NextGlobalPosition) { + return false; + } + } + writer.AddRealData(Chunk->GetColValue(), NextLocalPosition); + if (++NextLocalPosition < Chunk->GetNotDefaultRecordsCount()) { + NextGlobalPosition = CommonShift + Chunk->GetIndexUnsafeFast(NextLocalPosition); + return true; + } else { + NextGlobalPosition = CommonShift + Chunk->GetRecordsCount(); + return false; + } + } + AFL_VERIFY(Chunk->GetStartPosition() <= index); + if (CommonShift + Chunk->GetRecordsCount() <= index) { + InitArrays(index); + } + bool found = false; + for (; NextLocalPosition < Chunk->GetNotDefaultRecordsCount(); ++NextLocalPosition) { + NextGlobalPosition = CommonShift + Chunk->GetIndexUnsafeFast(NextLocalPosition); + if (NextGlobalPosition == index) { + writer.AddRealData(Chunk->GetColValue(), NextLocalPosition); + found = true; + } else if (index < NextGlobalPosition) { + return found; + } + } + NextGlobalPosition = CommonShift + Chunk->GetRecordsCount(); + return false; +} + +void TSparsedMerger::TCursor::InitArrays(const ui32 position) { + if (!CurrentOwnedArray || CurrentOwnedArray->GetFinishPosition() <= position) { + CurrentOwnedArray = Array->GetArray({}, position, Array); + if (CurrentOwnedArray->GetArray()->GetType() == NArrow::NAccessor::IChunkedArray::EType::SparsedArray) { + CurrentSparsedArray = static_pointer_cast(CurrentOwnedArray->GetArray()); + } else { + CurrentSparsedArray = make_shared(*CurrentOwnedArray->GetArray(), Context.GetDefaultValue()); + } + Chunk.reset(); + } + if (!Chunk || Chunk->GetFinishPosition() <= position) { + Chunk = CurrentSparsedArray->GetSparsedChunk(position - CurrentOwnedArray->GetStartPosition()); + AFL_VERIFY(Chunk->GetRecordsCount()); + AFL_VERIFY(CurrentOwnedArray->GetStartPosition() + Chunk->GetStartPosition() <= position && position < CurrentOwnedArray->GetStartPosition() + Chunk->GetFinishPosition()) + ("pos", position)("start", Chunk->GetStartPosition())("finish", Chunk->GetFinishPosition()) + ("shift", CurrentOwnedArray->GetStartPosition()); + } + CommonShift = CurrentOwnedArray->GetStartPosition() + Chunk->GetStartPosition(); + NextGlobalPosition = CurrentOwnedArray->GetStartPosition() + Chunk->GetFirstIndexNotDefault(); + NextLocalPosition = 0; +} + +} // namespace NKikimr::NOlap::NCompaction diff --git a/ydb/core/tx/columnshard/engines/changes/compaction/sparsed/logic.h b/ydb/core/tx/columnshard/engines/changes/compaction/sparsed/logic.h new file mode 100644 index 000000000000..504b890749e1 --- /dev/null +++ b/ydb/core/tx/columnshard/engines/changes/compaction/sparsed/logic.h @@ -0,0 +1,77 @@ +#pragma once +#include +#include +#include +#include + +namespace NKikimr::NOlap::NCompaction { +class TSparsedMerger: public IColumnMerger { +private: + static inline auto Registrator = TFactory::TRegistrator(NArrow::NAccessor::TGlobalConst::SparsedDataAccessorName); + + using TBase = IColumnMerger; + class TWriter: public TColumnPortionResult { + private: + using TBase = TColumnPortionResult; + const std::shared_ptr DataType; + const TColumnMergeContext& Context; + std::unique_ptr IndexBuilder; + std::unique_ptr ValueBuilder; + arrow::UInt32Builder* IndexBuilderImpl = nullptr; + ui32 CurrentRecordIdx = 0; + ui32 UsefulRecordsCount = 0; + + public: + TWriter(const TColumnMergeContext& context); + + bool HasData() const { + return CurrentRecordIdx; + } + + bool HasUsefulData() const { + return UsefulRecordsCount; + } + + ui32 AddPosition() { + return ++CurrentRecordIdx; + } + + void AddRealData(const std::shared_ptr& arr, const ui32 index); + + TColumnPortionResult Flush(); + }; + + class TCursor { + private: + std::shared_ptr Array; + std::optional CurrentOwnedArray; + std::shared_ptr CurrentSparsedArray; + ui32 NextGlobalPosition = 0; + ui32 NextLocalPosition = 0; + ui32 CommonShift = 0; + std::optional Chunk; + const TColumnMergeContext& Context; + void InitArrays(const ui32 position); + + public: + TCursor(const std::shared_ptr& array, const TColumnMergeContext& context) + : Array(array) + , Context(context) { + AFL_VERIFY(Array->GetRecordsCount()); + InitArrays(0); + } + + bool AddIndexTo(const ui32 index, TWriter& writer); + }; + + std::vector Cursors; + virtual void DoStart(const std::vector>& input) override; + + virtual std::vector DoExecute( + const TChunkMergeContext& context, const arrow::UInt16Array& pIdxArray, const arrow::UInt32Array& pRecordIdxArray) override; + +public: + using TBase::TBase; +}; + +} // namespace NKikimr::NOlap::NCompaction diff --git a/ydb/core/tx/columnshard/engines/changes/compaction/sparsed/ya.make b/ydb/core/tx/columnshard/engines/changes/compaction/sparsed/ya.make new file mode 100644 index 000000000000..e24e8341aa7d --- /dev/null +++ b/ydb/core/tx/columnshard/engines/changes/compaction/sparsed/ya.make @@ -0,0 +1,11 @@ +LIBRARY() + +SRCS( + GLOBAL logic.cpp +) + +PEERDIR( + ydb/core/tx/columnshard/engines/changes/compaction/common +) + +END() diff --git a/ydb/core/tx/columnshard/engines/changes/compaction/ya.make b/ydb/core/tx/columnshard/engines/changes/compaction/ya.make index c6a7bc101f9a..5e76aa0d8971 100644 --- a/ydb/core/tx/columnshard/engines/changes/compaction/ya.make +++ b/ydb/core/tx/columnshard/engines/changes/compaction/ya.make @@ -7,8 +7,9 @@ SRCS( PEERDIR( ydb/core/tx/tiering ydb/core/tx/columnshard/engines/changes/compaction/abstract - ydb/core/tx/columnshard/engines/changes/compaction/plain ydb/core/tx/columnshard/engines/changes/compaction/common + ydb/core/tx/columnshard/engines/changes/compaction/plain + ydb/core/tx/columnshard/engines/changes/compaction/sparsed ) END() diff --git a/ydb/core/tx/columnshard/engines/changes/general_compaction.cpp b/ydb/core/tx/columnshard/engines/changes/general_compaction.cpp index bfbd1eac5133..e78f71919009 100644 --- a/ydb/core/tx/columnshard/engines/changes/general_compaction.cpp +++ b/ydb/core/tx/columnshard/engines/changes/general_compaction.cpp @@ -81,7 +81,7 @@ void TGeneralCompactColumnEngineChanges::BuildAppendedPortionsByChunks( auto resultSchema = context.SchemaVersions.GetLastSchema(); auto shardingActual = context.SchemaVersions.GetShardingInfoActual(GranuleMeta->GetPathId()); - std::shared_ptr stats = std::make_shared(); + std::shared_ptr stats = std::make_shared(); std::shared_ptr resultFiltered; NCompaction::TMerger merger(context, SaverContext); { @@ -115,12 +115,11 @@ void TGeneralCompactColumnEngineChanges::BuildAppendedPortionsByChunks( pkColumnIds.emplace((ui32)IIndexInfo::ESpecialColumn::DELETE_FLAG); } } - resultFiltered = std::make_shared(resultSchema, dataColumnIds); { auto seqDataColumnIds = dataColumnIds; for (auto&& i : pkColumnIds) { - AFL_VERIFY(seqDataColumnIds.erase(i)); + AFL_VERIFY(seqDataColumnIds.erase(i))("id", i); } THashSet usedPortionIds; for (auto&& i : portions) { diff --git a/ydb/core/tx/columnshard/engines/changes/indexation.cpp b/ydb/core/tx/columnshard/engines/changes/indexation.cpp index 4218409e29d6..a2c8f38b91b8 100644 --- a/ydb/core/tx/columnshard/engines/changes/indexation.cpp +++ b/ydb/core/tx/columnshard/engines/changes/indexation.cpp @@ -1,6 +1,7 @@ #include "indexation.h" #include "compaction/merger.h" + #include namespace NKikimr::NOlap { @@ -50,17 +51,69 @@ namespace { class TBatchInfo { private: YDB_READONLY_DEF(std::shared_ptr, Batch); - const NEvWrite::EModificationType ModificationType; + public: TBatchInfo(const std::shared_ptr& batch, const NEvWrite::EModificationType modificationType) - : Batch(batch) - , ModificationType(modificationType) + : Batch(batch) { + } +}; + +class TPathFieldsInfo { +private: + std::set UsageColumnIds; + const ISnapshotSchema::TPtr ResultSchema; + THashMap Schemas; + bool Finished = false; + const ui32 FullColumnsCount; + +public: + TPathFieldsInfo(const ISnapshotSchema::TPtr& resultSchema) + : UsageColumnIds(IIndexInfo::GetNecessarySystemColumnIdsSet()) + , ResultSchema(resultSchema) + , FullColumnsCount(ResultSchema->GetIndexInfo().GetColumnIds(true).size()) { + AFL_VERIFY(FullColumnsCount); + } + + bool IsFinished() const { + return Finished; + } + + bool HasDeletion() const { + AFL_VERIFY(Finished); + return UsageColumnIds.contains((ui32)IIndexInfo::ESpecialColumn::DELETE_FLAG); + } + + void Finish() { + AFL_VERIFY(UsageColumnIds.size()); + AFL_VERIFY(!Finished); + Finished = true; + if (UsageColumnIds.size() == FullColumnsCount) { + return; + } + auto defaultDiffs = ISnapshotSchema::GetColumnsWithDifferentDefaults(Schemas, ResultSchema); + UsageColumnIds.insert(defaultDiffs.begin(), defaultDiffs.end()); + } + const std::set& GetUsageColumnIds() const { + AFL_VERIFY(Finished); + return UsageColumnIds; } - bool GetIsDeletion() const { - return ModificationType == NEvWrite::EModificationType::Delete; + void AddChunkInfo(const TInsertedData& data, const TConstructionContext& context) { + AFL_VERIFY(!Finished); + if (UsageColumnIds.size() == FullColumnsCount) { + return; + } + auto blobSchema = context.SchemaVersions.GetSchemaVerified(data.GetSchemaVersion()); + if (!Schemas.contains(data.GetSchemaVersion())) { + Schemas.emplace(data.GetSchemaVersion(), blobSchema); + } + std::vector filteredIds = data.GetMeta().GetSchemaSubset().Apply(blobSchema->GetIndexInfo().GetColumnIds(false)); + if (data.GetMeta().GetModificationType() == NEvWrite::EModificationType::Delete) { + filteredIds.emplace_back((ui32)IIndexInfo::ESpecialColumn::DELETE_FLAG); + } + UsageColumnIds.insert(filteredIds.begin(), filteredIds.end()); } }; @@ -68,12 +121,20 @@ class TPathData { private: std::vector Batches; YDB_READONLY_DEF(std::optional, ShardingInfo); - bool HasDeletionFlag = false; + TPathFieldsInfo ColumnsInfo; + public: - TPathData(const std::optional& shardingInfo) + TPathData(const std::optional& shardingInfo, const ISnapshotSchema::TPtr& resultSchema) : ShardingInfo(shardingInfo) - { - + , ColumnsInfo(resultSchema) { + } + + const TPathFieldsInfo& GetColumnsInfo() const { + return ColumnsInfo; + } + + void FinishChunksInfo() { + ColumnsInfo.Finish(); } std::vector> GetGeneralContainers() const { @@ -84,14 +145,16 @@ class TPathData { return result; } + void AddChunkInfo(const NOlap::TInsertedData& data, const TConstructionContext& context) { + ColumnsInfo.AddChunkInfo(data, context); + } + bool HasDeletion() { - return HasDeletionFlag; + return ColumnsInfo.HasDeletion(); } void AddBatch(const NOlap::TInsertedData& data, const std::shared_ptr& batch) { - if (data.GetMeta().GetModificationType() == NEvWrite::EModificationType::Delete) { - HasDeletionFlag = true; - } + AFL_VERIFY(ColumnsInfo.IsFinished()); AFL_VERIFY(batch); Batches.emplace_back(batch, data.GetMeta().GetModificationType()); } @@ -108,23 +171,47 @@ class TPathData { class TPathesData { private: THashMap Data; + const ISnapshotSchema::TPtr ResultSchema; public: + TPathesData(const ISnapshotSchema::TPtr& resultSchema) + : ResultSchema(resultSchema) { + } + + void FinishChunksInfo() { + for (auto&& i : Data) { + i.second.FinishChunksInfo(); + } + } + const THashMap& GetData() const { return Data; } - void Add(const NOlap::TInsertedData& inserted, const std::optional& info, - const std::shared_ptr& batch) { + void AddChunkInfo(const NOlap::TInsertedData& inserted, const TConstructionContext& context) { + auto shardingFilterCommit = context.SchemaVersions.GetShardingInfoOptional(inserted.PathId, inserted.GetSnapshot()); auto it = Data.find(inserted.PathId); if (it == Data.end()) { - it = Data.emplace(inserted.PathId, info).first; + it = Data.emplace(inserted.PathId, TPathData(shardingFilterCommit, ResultSchema)).first; } - it->second.AddShardingInfo(info); + it->second.AddChunkInfo(inserted, context); + it->second.AddShardingInfo(shardingFilterCommit); + } + + void AddBatch(const NOlap::TInsertedData& inserted, const std::shared_ptr& batch) { + auto it = Data.find(inserted.PathId); + AFL_VERIFY(it != Data.end()); it->second.AddBatch(inserted, batch); } + + const TPathFieldsInfo& GetPathInfo(const ui64 pathId) const { + auto it = Data.find(pathId); + AFL_VERIFY(it != Data.end()); + return it->second.GetColumnsInfo(); + } }; -} + +} // namespace TConclusionStatus TInsertColumnEngineChanges::DoConstructBlobs(TConstructionContext& context) noexcept { Y_ABORT_UNLESS(!DataToIndex.empty()); @@ -143,34 +230,15 @@ TConclusionStatus TInsertColumnEngineChanges::DoConstructBlobs(TConstructionCont auto resultSchema = context.SchemaVersions.GetSchema(maxSnapshot); Y_ABORT_UNLESS(resultSchema->GetIndexInfo().IsSorted()); - TPathesData pathBatches; - std::set usageColumnIds; - { - THashMap schemas; - for (auto& inserted : DataToIndex) { - if (schemas.contains(inserted.GetSchemaVersion())) { - continue; - } - schemas.emplace(inserted.GetSchemaVersion(), context.SchemaVersions.GetSchemaVerified(inserted.GetSchemaVersion())); - } - usageColumnIds = ISnapshotSchema::GetColumnsWithDifferentDefaults(schemas, resultSchema); - } - + TPathesData pathBatches(resultSchema); for (auto& inserted : DataToIndex) { - auto blobSchema = context.SchemaVersions.GetSchemaVerified(inserted.GetSchemaVersion()); - std::vector filteredIds = inserted.GetMeta().GetSchemaSubset().Apply(blobSchema->GetIndexInfo().GetColumnIds(false)); - usageColumnIds.insert(filteredIds.begin(), filteredIds.end()); - if (inserted.GetMeta().GetModificationType() == NEvWrite::EModificationType::Delete) { - usageColumnIds.emplace((ui32)IIndexInfo::ESpecialColumn::DELETE_FLAG); - } - if (usageColumnIds.size() == resultSchema->GetIndexInfo().GetColumnIds(true).size()) { - break; - } + pathBatches.AddChunkInfo(inserted, context); } + pathBatches.FinishChunksInfo(); + for (auto& inserted : DataToIndex) { const TBlobRange& blobRange = inserted.GetBlobRange(); - auto shardingFilterCommit = context.SchemaVersions.GetShardingInfoOptional(inserted.PathId, inserted.GetSnapshot()); auto blobSchema = context.SchemaVersions.GetSchemaVerified(inserted.GetSchemaVersion()); std::shared_ptr batch; @@ -180,22 +248,23 @@ TConclusionStatus TInsertColumnEngineChanges::DoConstructBlobs(TConstructionCont std::make_shared(inserted.GetMeta().GetSchemaSubset().Apply(blobSchema->GetIndexInfo().ArrowSchema()->fields())); batch = std::make_shared(NArrow::DeserializeBatch(blobData, batchSchema)); } - IIndexInfo::AddSnapshotColumns(*batch, inserted.GetSnapshot()); - if (usageColumnIds.contains((ui32)IIndexInfo::ESpecialColumn::DELETE_FLAG)) { + + auto& pathInfo = pathBatches.GetPathInfo(inserted.PathId); + + if (pathInfo.HasDeletion()) { IIndexInfo::AddDeleteFlagsColumn(*batch, inserted.GetMeta().GetModificationType() == NEvWrite::EModificationType::Delete); } - usageColumnIds.insert(IIndexInfo::GetSnapshotColumnIds().begin(), IIndexInfo::GetSnapshotColumnIds().end()); - batch = resultSchema->NormalizeBatch(*blobSchema, batch, usageColumnIds).DetachResult(); - pathBatches.Add(inserted, shardingFilterCommit, batch); + batch = resultSchema->NormalizeBatch(*blobSchema, batch, pathInfo.GetUsageColumnIds()).DetachResult(); + pathBatches.AddBatch(inserted, batch); } Y_ABORT_UNLESS(Blobs.IsEmpty()); - auto filteredSnapshot = std::make_shared(resultSchema, usageColumnIds); - auto stats = std::make_shared(); + auto stats = std::make_shared(); std::vector> filters; for (auto& [pathId, pathInfo] : pathBatches.GetData()) { + auto filteredSnapshot = std::make_shared(resultSchema, pathInfo.GetColumnsInfo().GetUsageColumnIds()); std::optional shardingVersion; if (pathInfo.GetShardingInfo()) { shardingVersion = pathInfo.GetShardingInfo()->GetSnapshotVersion(); @@ -221,4 +290,4 @@ NColumnShard::ECumulativeCounters TInsertColumnEngineChanges::GetCounterIndex(co return isSuccess ? NColumnShard::COUNTER_INDEXING_SUCCESS : NColumnShard::COUNTER_INDEXING_FAIL; } -} +} // namespace NKikimr::NOlap diff --git a/ydb/core/tx/columnshard/engines/changes/indexation.h b/ydb/core/tx/columnshard/engines/changes/indexation.h index d130612b7451..a93bfd4dec6c 100644 --- a/ydb/core/tx/columnshard/engines/changes/indexation.h +++ b/ydb/core/tx/columnshard/engines/changes/indexation.h @@ -10,7 +10,7 @@ namespace NKikimr::NOlap { class TInsertColumnEngineChanges: public TChangesWithAppend { private: using TBase = TChangesWithAppend; - std::vector DataToIndex; + std::vector DataToIndex; protected: virtual void DoWriteIndexOnComplete(NColumnShard::TColumnShard* self, TWriteIndexCompleteContext& context) override; virtual void DoWriteIndexOnExecute(NColumnShard::TColumnShard* self, TWriteIndexContext& context) override; diff --git a/ydb/core/tx/columnshard/engines/portions/column_record.cpp b/ydb/core/tx/columnshard/engines/portions/column_record.cpp index e4fbef70c5fc..6127ad439326 100644 --- a/ydb/core/tx/columnshard/engines/portions/column_record.cpp +++ b/ydb/core/tx/columnshard/engines/portions/column_record.cpp @@ -28,7 +28,7 @@ TChunkMeta::TChunkMeta(const TColumnChunkLoadContext& context, const TSimpleColu DeserializeFromProto(context.GetAddress(), context.GetMetaProto(), columnInfo).Validate(); } -TChunkMeta::TChunkMeta(const std::shared_ptr& column, const TSimpleColumnInfo& columnInfo) +TChunkMeta::TChunkMeta(const std::shared_ptr& column, const TSimpleColumnInfo& columnInfo) : TBase(column, columnInfo.GetNeedMinMax(), columnInfo.GetIsSorted()) { } @@ -52,7 +52,8 @@ TColumnRecord::TColumnRecord(const TBlobRangeLink16::TLinkId blobLinkId, const T { } -TColumnRecord::TColumnRecord(const TChunkAddress& address, const std::shared_ptr& column, const TSimpleColumnInfo& columnInfo) +TColumnRecord::TColumnRecord( + const TChunkAddress& address, const std::shared_ptr& column, const TSimpleColumnInfo& columnInfo) : Meta(column, columnInfo) , ColumnId(address.GetColumnId()) , Chunk(address.GetChunk()) diff --git a/ydb/core/tx/columnshard/engines/portions/column_record.h b/ydb/core/tx/columnshard/engines/portions/column_record.h index 0109b372f302..2b984e4f05f3 100644 --- a/ydb/core/tx/columnshard/engines/portions/column_record.h +++ b/ydb/core/tx/columnshard/engines/portions/column_record.h @@ -2,20 +2,19 @@ #include "common.h" -#include - +#include +#include #include #include -#include -#include +#include #include +#include #include -#include #include #include - +#include #include namespace NKikimrColumnShardDataSharingProto { @@ -31,16 +30,17 @@ struct TChunkMeta: public TSimpleChunkMeta { private: using TBase = TSimpleChunkMeta; TChunkMeta() = default; - [[nodiscard]] TConclusionStatus DeserializeFromProto(const TChunkAddress& address, const NKikimrTxColumnShard::TIndexColumnMeta& proto, const TSimpleColumnInfo& columnInfo); + [[nodiscard]] TConclusionStatus DeserializeFromProto( + const TChunkAddress& address, const NKikimrTxColumnShard::TIndexColumnMeta& proto, const TSimpleColumnInfo& columnInfo); friend class TColumnRecord; + public: TChunkMeta(TSimpleChunkMeta&& baseMeta) - : TBase(baseMeta) - { - + : TBase(baseMeta) { } - [[nodiscard]] static TConclusion BuildFromProto(const TChunkAddress& address, const NKikimrTxColumnShard::TIndexColumnMeta& proto, const TSimpleColumnInfo& columnInfo) { + [[nodiscard]] static TConclusion BuildFromProto( + const TChunkAddress& address, const NKikimrTxColumnShard::TIndexColumnMeta& proto, const TSimpleColumnInfo& columnInfo) { TChunkMeta result; auto parse = result.DeserializeFromProto(address, proto, columnInfo); if (!parse) { @@ -63,20 +63,19 @@ struct TChunkMeta: public TSimpleChunkMeta { TChunkMeta(const TColumnChunkLoadContext& context, const TSimpleColumnInfo& columnInfo); - TChunkMeta(const std::shared_ptr& column, const TSimpleColumnInfo& columnInfo); + TChunkMeta(const std::shared_ptr& column, const TSimpleColumnInfo& columnInfo); }; class TColumnRecord { private: TChunkMeta Meta; TColumnRecord(TChunkMeta&& meta) - : Meta(std::move(meta)) - { - + : Meta(std::move(meta)) { } TColumnRecord() = default; TConclusionStatus DeserializeFromProto(const NKikimrColumnShardDataSharingProto::TColumnRecord& proto, const TSimpleColumnInfo& columnInfo); + public: ui32 ColumnId = 0; ui16 Chunk = 0; @@ -99,9 +98,7 @@ class TColumnRecord { : Meta(std::move(meta)) , ColumnId(address.GetColumnId()) , Chunk(address.GetChunk()) - , BlobRange(range) - { - + , BlobRange(range) { } class TTestInstanceBuilder { @@ -116,7 +113,7 @@ class TColumnRecord { } }; - ui32 GetColumnId() const { + ui32 GetColumnId() const { return ColumnId; } ui16 GetChunkIdx() const { @@ -127,7 +124,8 @@ class TColumnRecord { } NKikimrColumnShardDataSharingProto::TColumnRecord SerializeToProto() const; - static TConclusion BuildFromProto(const NKikimrColumnShardDataSharingProto::TColumnRecord& proto, const TSimpleColumnInfo& columnInfo) { + static TConclusion BuildFromProto( + const NKikimrColumnShardDataSharingProto::TColumnRecord& proto, const TSimpleColumnInfo& columnInfo) { TColumnRecord result; auto parse = result.DeserializeFromProto(proto, columnInfo); if (!parse) { @@ -136,14 +134,14 @@ class TColumnRecord { return result; } - TColumnSerializationStat GetSerializationStat(const std::string& columnName) const { - TColumnSerializationStat result(ColumnId, columnName); + NArrow::NSplitter::TColumnSerializationStat GetSerializationStat(const std::string& columnName) const { + NArrow::NSplitter::TColumnSerializationStat result(ColumnId, columnName); result.Merge(GetSerializationStat()); return result; } - TSimpleSerializationStat GetSerializationStat() const { - return TSimpleSerializationStat(BlobRange.Size, Meta.GetNumRows(), Meta.GetRawBytes()); + NArrow::NSplitter::TSimpleSerializationStat GetSerializationStat() const { + return NArrow::NSplitter::TSimpleSerializationStat(BlobRange.Size, Meta.GetNumRows(), Meta.GetRawBytes()); } const TChunkMeta& GetMeta() const { @@ -163,18 +161,17 @@ class TColumnRecord { } TString DebugString() const { - return TStringBuilder() - << "column_id:" << ColumnId << ";" - << "chunk_idx:" << Chunk << ";" - << "blob_range:" << BlobRange.ToString() << ";" - ; + return TStringBuilder() << "column_id:" << ColumnId << ";" + << "chunk_idx:" << Chunk << ";" + << "blob_range:" << BlobRange.ToString() << ";"; } - TColumnRecord(const TChunkAddress& address, const std::shared_ptr& column, const TSimpleColumnInfo& columnInfo); + TColumnRecord( + const TChunkAddress& address, const std::shared_ptr& column, const TSimpleColumnInfo& columnInfo); TColumnRecord(const TBlobRangeLink16::TLinkId blobLinkId, const TColumnChunkLoadContext& loadContext, const TSimpleColumnInfo& columnInfo); - friend IOutputStream& operator << (IOutputStream& out, const TColumnRecord& rec) { + friend IOutputStream& operator<<(IOutputStream& out, const TColumnRecord& rec) { out << '{'; if (rec.Chunk) { out << 'n' << rec.Chunk; @@ -186,52 +183,4 @@ class TColumnRecord { } }; -class TSimpleOrderedColumnChunk: public IPortionColumnChunk { -private: - using TBase = IPortionColumnChunk; - const TColumnRecord ColumnRecord; - YDB_READONLY_DEF(TString, Data); -protected: - virtual TString DoDebugString() const override { - TStringBuilder sb; - sb << "column_id=" << GetColumnId() << ";data_size=" << Data.size() << ";"; - if (GetChunkIdxOptional()) { - sb << "chunk=" << GetChunkIdxVerified() << ";"; - } else { - sb << "chunk=NO_INITIALIZED;"; - } - return sb; - } - - virtual const TString& DoGetData() const override { - return Data; - } - virtual ui64 DoGetRawBytesImpl() const override { - return ColumnRecord.GetMeta().GetRawBytes(); - } - virtual ui32 DoGetRecordsCountImpl() const override { - return ColumnRecord.GetMeta().GetNumRows(); - } - virtual std::vector> DoInternalSplitImpl(const TColumnSaver& /*saver*/, const std::shared_ptr& /*counters*/, - const std::vector& /*splitSizes*/) const override { - Y_ABORT_UNLESS(false); - return {}; - } - virtual TSimpleChunkMeta DoBuildSimpleChunkMeta() const override { - return ColumnRecord.GetMeta(); - } - virtual std::shared_ptr DoGetFirstScalar() const override { - return nullptr; - } - virtual std::shared_ptr DoGetLastScalar() const override { - return nullptr; - } -public: - TSimpleOrderedColumnChunk(const TColumnRecord& cRecord, const TString& data) - : TBase(cRecord.ColumnId, cRecord.Chunk) - , ColumnRecord(cRecord) - , Data(data) { - } -}; - -} +} // namespace NKikimr::NOlap diff --git a/ydb/core/tx/columnshard/engines/portions/common.h b/ydb/core/tx/columnshard/engines/portions/common.h index 1231a1e9f5f3..3702887ccc81 100644 --- a/ydb/core/tx/columnshard/engines/portions/common.h +++ b/ydb/core/tx/columnshard/engines/portions/common.h @@ -1,7 +1,9 @@ #pragma once #include +#include namespace NKikimr::NOlap { +using TColumnSaver = NArrow::NAccessor::TColumnSaver; class TChunkAddress { private: diff --git a/ydb/core/tx/columnshard/engines/portions/index_chunk.h b/ydb/core/tx/columnshard/engines/portions/index_chunk.h index 1fe92adfb539..65e0855d5eef 100644 --- a/ydb/core/tx/columnshard/engines/portions/index_chunk.h +++ b/ydb/core/tx/columnshard/engines/portions/index_chunk.h @@ -6,7 +6,6 @@ #include #include -#include #include #include diff --git a/ydb/core/tx/columnshard/engines/portions/portion_info.cpp b/ydb/core/tx/columnshard/engines/portions/portion_info.cpp index b89b63e97b5c..0b4b3bc89a10 100644 --- a/ydb/core/tx/columnshard/engines/portions/portion_info.cpp +++ b/ydb/core/tx/columnshard/engines/portions/portion_info.cpp @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include #include #include @@ -631,30 +633,6 @@ TPortionInfo::TPreparedBatchData PrepareForAssembleImpl(const TPortionInfo& port } -namespace { -class TChunkAccessor { -private: - const std::vector& Chunks; - const std::shared_ptr Loader; -public: - TChunkAccessor(const std::vector& chunks, const std::shared_ptr& loader) - : Chunks(chunks) - , Loader(loader) - { - - } - ui64 GetChunksCount() const { - return Chunks.size(); - } - ui64 GetChunkLength(const ui32 idx) const { - return Chunks[idx].GetRecordsCount(); - } - std::shared_ptr GetArray(const ui32 idx) const { - return Chunks[idx].GetArrayVerified(Loader); - } -}; -} - ISnapshotSchema::TPtr TPortionInfo::TSchemaCursor::GetSchema(const TPortionInfoConstructor& portion) { if (!CurrentSchema || portion.GetMinSnapshotDeprecatedVerified() != LastSnapshot) { CurrentSchema = portion.GetSchema(VersionedIndex); @@ -664,12 +642,8 @@ ISnapshotSchema::TPtr TPortionInfo::TSchemaCursor::GetSchema(const TPortionInfoC return CurrentSchema; } -NArrow::NAccessor::IChunkedArray::TCurrentChunkAddress TDeserializeChunkedArray::DoGetChunk(const std::optional& chunkCurrent, const ui64 position) const { - TChunkAccessor accessor(Chunks, Loader); - return SelectChunk(chunkCurrent, position, accessor); -} - -TPortionInfo::TPreparedBatchData TPortionInfo::PrepareForAssemble(const ISnapshotSchema& dataSchema, const ISnapshotSchema& resultSchema, THashMap& blobsData) const { +TPortionInfo::TPreparedBatchData TPortionInfo::PrepareForAssemble( + const ISnapshotSchema& dataSchema, const ISnapshotSchema& resultSchema, THashMap& blobsData) const { return PrepareForAssembleImpl(*this, dataSchema, resultSchema, blobsData); } @@ -687,25 +661,18 @@ bool TPortionInfo::NeedShardingFilter(const TGranuleShardingInfo& shardingInfo) std::shared_ptr TPortionInfo::TPreparedColumn::AssembleAccessor() const { Y_ABORT_UNLESS(!Blobs.empty()); - std::vector> chunks; - chunks.reserve(Blobs.size()); + NArrow::NAccessor::TCompositeChunkedArray::TBuilder builder(GetField()->type()); for (auto& blob : Blobs) { - auto batch = blob.BuildRecordBatch(*Loader); - Y_ABORT_UNLESS(batch); - AFL_VERIFY(batch->num_columns() == 1); - chunks.emplace_back(batch->column(0)); - } - if (chunks.size() > 1) { - return std::make_shared(NArrow::TStatusValidator::GetValid(arrow::ChunkedArray::Make(chunks))); - } else { - return std::make_shared(chunks.front()); + auto chunkedArray = blob.BuildRecordBatch(*Loader); + builder.AddChunk(chunkedArray); } + return builder.Finish(); } -std::shared_ptr TPortionInfo::TPreparedColumn::AssembleForSeqAccess() const { +std::shared_ptr TPortionInfo::TPreparedColumn::AssembleForSeqAccess() const { Y_ABORT_UNLESS(!Blobs.empty()); - std::vector chunks; + std::vector chunks; chunks.reserve(Blobs.size()); ui64 recordsCount = 0; for (auto& blob : Blobs) { @@ -717,64 +684,31 @@ std::shared_ptr TPortionInfo::TPreparedColumn::Assembl } } - return std::make_shared(recordsCount, Loader, std::move(chunks)); -} - -std::shared_ptr TPortionInfo::TPreparedColumn::Assemble() const { - Y_ABORT_UNLESS(!Blobs.empty()); - - std::vector> chunks; - chunks.reserve(Blobs.size()); - for (auto& blob : Blobs) { - auto batch = blob.BuildRecordBatch(*Loader); - Y_ABORT_UNLESS(batch); - Y_ABORT_UNLESS(batch->num_columns() == 1); - chunks.emplace_back(batch->column(0)); - } - - return NArrow::TStatusValidator::GetValid(arrow::ChunkedArray::Make(chunks)); + return std::make_shared(recordsCount, Loader, std::move(chunks)); } -TDeserializeChunkedArray::TChunk TPortionInfo::TAssembleBlobInfo::BuildDeserializeChunk(const std::shared_ptr& loader) const { +NArrow::NAccessor::TDeserializeChunkedArray::TChunk TPortionInfo::TAssembleBlobInfo::BuildDeserializeChunk( + const std::shared_ptr& loader) const { if (DefaultRowsCount) { Y_ABORT_UNLESS(!Data); - AFL_VERIFY(loader->GetExpectedSchema()->num_fields() == 1); - auto col = NArrow::TThreadSimpleArraysCache::Get(loader->GetExpectedSchema()->field(0)->type(), DefaultValue, DefaultRowsCount); - return TDeserializeChunkedArray::TChunk(col); + auto col = std::make_shared( + NArrow::TThreadSimpleArraysCache::Get(loader->GetField()->type(), DefaultValue, DefaultRowsCount)); + return NArrow::NAccessor::TDeserializeChunkedArray::TChunk(col); } else { AFL_VERIFY(ExpectedRowsCount); - return TDeserializeChunkedArray::TChunk(*ExpectedRowsCount, Data); + return NArrow::NAccessor::TDeserializeChunkedArray::TChunk(*ExpectedRowsCount, Data); } } -std::shared_ptr TPortionInfo::TAssembleBlobInfo::BuildRecordBatch(const TColumnLoader& loader) const { +std::shared_ptr TPortionInfo::TAssembleBlobInfo::BuildRecordBatch(const TColumnLoader& loader) const { if (DefaultRowsCount) { Y_ABORT_UNLESS(!Data); - AFL_VERIFY(loader.GetExpectedSchema()->num_fields() == 1); - return arrow::RecordBatch::Make(loader.GetExpectedSchema(), DefaultRowsCount, - { NArrow::TThreadSimpleArraysCache::Get(loader.GetExpectedSchema()->field(0)->type(), DefaultValue, DefaultRowsCount) }); + return std::make_shared( + NArrow::TThreadSimpleArraysCache::Get(loader.GetField()->type(), DefaultValue, DefaultRowsCount)); } else { - auto result = loader.Apply(Data); - if (!result.ok()) { - AFL_ERROR(NKikimrServices::TX_COLUMNSHARD_SCAN)("event", "cannot unpack batch")("error", result.status().ToString())("loader", loader.DebugString()); - return nullptr; - } - if (ExpectedRowsCount) { - AFL_VERIFY((*result)->num_rows() == ExpectedRowsCount)("real", (*result)->num_rows())("expected", ExpectedRowsCount); - } - return *result; - } -} - -std::shared_ptr TPortionInfo::TPreparedBatchData::AssembleForSeqAccess() const { - std::vector> columns; - std::vector> fields; - for (auto&& i : Columns) { - columns.emplace_back(i.AssembleForSeqAccess()); - fields.emplace_back(i.GetField()); + AFL_VERIFY(ExpectedRowsCount); + return loader.ApplyVerified(Data, *ExpectedRowsCount); } - - return std::make_shared(fields, std::move(columns)); } std::shared_ptr TPortionInfo::TPreparedBatchData::AssembleToGeneralContainer( diff --git a/ydb/core/tx/columnshard/engines/portions/portion_info.h b/ydb/core/tx/columnshard/engines/portions/portion_info.h index b7b89c2f187d..1f1bd6ac045f 100644 --- a/ydb/core/tx/columnshard/engines/portions/portion_info.h +++ b/ydb/core/tx/columnshard/engines/portions/portion_info.h @@ -3,9 +3,11 @@ #include "index_chunk.h" #include "meta.h" +#include #include -#include +#include #include +#include #include #include #include @@ -28,60 +30,6 @@ struct TIndexInfo; class TVersionedIndex; class IDbWrapper; -class TDeserializeChunkedArray: public NArrow::NAccessor::IChunkedArray { -private: - using TBase = NArrow::NAccessor::IChunkedArray; -public: - class TChunk { - private: - YDB_READONLY(ui32, RecordsCount, 0); - std::shared_ptr PredefinedArray; - const TString Data; - public: - TChunk(const std::shared_ptr& predefinedArray) - : PredefinedArray(predefinedArray) { - AFL_VERIFY(PredefinedArray); - RecordsCount = PredefinedArray->length(); - } - - TChunk(const ui32 recordsCount, const TString& data) - : RecordsCount(recordsCount) - , Data(data) { - - } - - std::shared_ptr GetArrayVerified(const std::shared_ptr& loader) const { - if (PredefinedArray) { - return PredefinedArray; - } - auto result = loader->ApplyVerified(Data); - AFL_VERIFY(result); - AFL_VERIFY(result->num_columns() == 1); - AFL_VERIFY(result->num_rows() == RecordsCount)("length", result->num_rows())("records_count", RecordsCount); - return result->column(0); - } - }; - - std::shared_ptr Loader; - std::vector Chunks; -protected: - virtual std::optional DoGetRawSize() const override { - return {}; - } - virtual TCurrentChunkAddress DoGetChunk(const std::optional& chunkCurrent, const ui64 position) const override; - virtual std::shared_ptr DoGetChunkedArray() const override { - AFL_VERIFY(false); - return nullptr; - } -public: - TDeserializeChunkedArray(const ui64 recordsCount, const std::shared_ptr& loader, std::vector&& chunks) - : TBase(recordsCount, NArrow::NAccessor::IChunkedArray::EType::SerializedChunkedArray, loader->GetField()->type()) - , Loader(loader) - , Chunks(std::move(chunks)) { - AFL_VERIFY(Loader); - } -}; - class TEntityChunk { private: TChunkAddress Address; @@ -371,8 +319,8 @@ class TPortionInfo { return result; } - TSerializationStats GetSerializationStat(const ISnapshotSchema& schema) const { - TSerializationStats result; + NArrow::NSplitter::TSerializationStats GetSerializationStat(const ISnapshotSchema& schema) const { + NArrow::NSplitter::TSerializationStats result; for (auto&& i : Records) { if (schema.GetFieldByColumnIdOptional(i.ColumnId)) { result.AddStat(i.GetSerializationStat(schema.GetFieldByColumnIdVerified(i.ColumnId)->name())); @@ -656,8 +604,8 @@ class TPortionInfo { return DefaultRowsCount && !Data; } - std::shared_ptr BuildRecordBatch(const TColumnLoader& loader) const; - TDeserializeChunkedArray::TChunk BuildDeserializeChunk(const std::shared_ptr& loader) const; + std::shared_ptr BuildRecordBatch(const TColumnLoader& loader) const; + NArrow::NAccessor::TDeserializeChunkedArray::TChunk BuildDeserializeChunk(const std::shared_ptr& loader) const; }; class TPreparedColumn { @@ -670,22 +618,20 @@ class TPortionInfo { } const std::string& GetName() const { - return Loader->GetExpectedSchema()->field(0)->name(); + return Loader->GetField()->name(); } std::shared_ptr GetField() const { - return Loader->GetExpectedSchema()->field(0); + return Loader->GetField(); } TPreparedColumn(std::vector&& blobs, const std::shared_ptr& loader) : Loader(loader) , Blobs(std::move(blobs)) { - Y_ABORT_UNLESS(Loader); - Y_ABORT_UNLESS(Loader->GetExpectedSchema()->num_fields() == 1); + AFL_VERIFY(Loader); } - std::shared_ptr Assemble() const; - std::shared_ptr AssembleForSeqAccess() const; + std::shared_ptr AssembleForSeqAccess() const; std::shared_ptr AssembleAccessor() const; }; @@ -752,7 +698,6 @@ class TPortionInfo { } std::shared_ptr AssembleToGeneralContainer(const std::set& sequentialColumnIds) const; - std::shared_ptr AssembleForSeqAccess() const; }; class TColumnAssemblingInfo { diff --git a/ydb/core/tx/columnshard/engines/portions/read_with_blobs.cpp b/ydb/core/tx/columnshard/engines/portions/read_with_blobs.cpp index 14fa9f854dcc..ae85ef59842c 100644 --- a/ydb/core/tx/columnshard/engines/portions/read_with_blobs.cpp +++ b/ydb/core/tx/columnshard/engines/portions/read_with_blobs.cpp @@ -104,8 +104,8 @@ std::optional TReadPortionInfoWithBlobs::SyncP std::vector> newChunks; if (it != columnChunks.end()) { newChunks = to->GetIndexInfo().ActualizeColumnData(it->second, from->GetIndexInfo(), i); + AFL_VERIFY(entityChunksNew.emplace(i, std::move(newChunks)).second); } - AFL_VERIFY(entityChunksNew.emplace(i, std::move(newChunks)).second); } TPortionInfoConstructor constructor(source.PortionInfo, false, true); @@ -120,7 +120,7 @@ std::optional TReadPortionInfoWithBlobs::SyncP } const NSplitter::TEntityGroups groups = to->GetIndexInfo().GetEntityGroupsByStorageId(targetTier, *storages); - auto schemaTo = std::make_shared(to, std::make_shared()); + auto schemaTo = std::make_shared(to, std::make_shared()); TGeneralSerializedSlice slice(secondaryData.GetExternalData(), schemaTo, counters); return TWritePortionInfoWithBlobsConstructor::BuildByBlobs( diff --git a/ydb/core/tx/columnshard/engines/reader/plain_reader/iterator/fetched_data.cpp b/ydb/core/tx/columnshard/engines/reader/plain_reader/iterator/fetched_data.cpp index e647c77313e7..ac7fe2c16bf3 100644 --- a/ydb/core/tx/columnshard/engines/reader/plain_reader/iterator/fetched_data.cpp +++ b/ydb/core/tx/columnshard/engines/reader/plain_reader/iterator/fetched_data.cpp @@ -1,7 +1,8 @@ #include "fetched_data.h" -#include + +#include #include -#include +#include namespace NKikimr::NOlap { @@ -11,10 +12,10 @@ void TFetchedData::SyncTableColumns(const std::vectorAddField(i, std::make_shared( - NArrow::TThreadSimpleArraysCache::Get(i->type(), schema.GetExternalDefaultValueVerified(i->name()), Table->num_rows()))) + ->AddField(i, std::make_shared(NArrow::TThreadSimpleArraysCache::Get( + i->type(), schema.GetExternalDefaultValueVerified(i->name()), Table->num_rows()))) .Validate(); } } -} +} // namespace NKikimr::NOlap diff --git a/ydb/core/tx/columnshard/engines/scheme/abstract/index_info.h b/ydb/core/tx/columnshard/engines/scheme/abstract/index_info.h index 43b337c1f120..e54bcd80b2ab 100644 --- a/ydb/core/tx/columnshard/engines/scheme/abstract/index_info.h +++ b/ydb/core/tx/columnshard/engines/scheme/abstract/index_info.h @@ -1,15 +1,19 @@ #pragma once -#include "loader.h" #include +#include +#include #include #include namespace NKikimr::NOlap { +using TColumnLoader = NArrow::NAccessor::TColumnLoader; +using TColumnSaver = NArrow::NAccessor::TColumnSaver; + class IIndexInfo { public: - enum class ESpecialColumn: ui32 { + enum class ESpecialColumn : ui32 { PLAN_STEP = NOlap::NPortion::TSpecialColumns::SPEC_COL_PLAN_STEP_INDEX, TX_ID = NOlap::NPortion::TSpecialColumns::SPEC_COL_TX_ID_INDEX, DELETE_FLAG = NOlap::NPortion::TSpecialColumns::SPEC_COL_DELETE_FLAG_INDEX @@ -17,7 +21,7 @@ class IIndexInfo { using TSystemColumnsSet = ui64; - enum class ESystemColumnsSet: ui64 { + enum class ESystemColumnsSet : ui64 { Snapshot = 1, Deletion = 1 << 1, }; @@ -30,6 +34,11 @@ class IIndexInfo { return SPEC_COL_DELETE_FLAG; } + static const std::set& GetNecessarySystemColumnIdsSet() { + static const std::set result = { (ui32)ESpecialColumn::PLAN_STEP, (ui32)ESpecialColumn::TX_ID }; + return result; + } + static const std::vector& GetSnapshotColumnNames() { static const std::vector result = { std::string(SPEC_COL_PLAN_STEP), std::string(SPEC_COL_TX_ID) }; return result; @@ -87,12 +96,14 @@ class IIndexInfo { } static const std::vector& GetSystemColumnNames() { - static const std::vector result = { std::string(SPEC_COL_PLAN_STEP), std::string(SPEC_COL_TX_ID), std::string(SPEC_COL_DELETE_FLAG) }; + static const std::vector result = { std::string(SPEC_COL_PLAN_STEP), std::string(SPEC_COL_TX_ID), + std::string(SPEC_COL_DELETE_FLAG) }; return result; } static const std::vector& GetSystemColumnIds() { - static const std::vector result = { (ui32)ESpecialColumn::PLAN_STEP, (ui32)ESpecialColumn::TX_ID, (ui32)ESpecialColumn::DELETE_FLAG }; + static const std::vector result = { (ui32)ESpecialColumn::PLAN_STEP, (ui32)ESpecialColumn::TX_ID, + (ui32)ESpecialColumn::DELETE_FLAG }; return result; } @@ -130,17 +141,14 @@ class IIndexInfo { } static std::shared_ptr ArrowSchemaSnapshot() { - static std::shared_ptr result = std::make_shared(arrow::FieldVector{ - arrow::field(SPEC_COL_PLAN_STEP, arrow::uint64()), - arrow::field(SPEC_COL_TX_ID, arrow::uint64()) - }); + static std::shared_ptr result = std::make_shared( + arrow::FieldVector{ arrow::field(SPEC_COL_PLAN_STEP, arrow::uint64()), arrow::field(SPEC_COL_TX_ID, arrow::uint64()) }); return result; } static std::shared_ptr ArrowSchemaDeletion() { - static std::shared_ptr result = std::make_shared(arrow::FieldVector{ - arrow::field(SPEC_COL_DELETE_FLAG, arrow::boolean()) - }); + static std::shared_ptr result = + std::make_shared(arrow::FieldVector{ arrow::field(SPEC_COL_DELETE_FLAG, arrow::boolean()) }); return result; } @@ -149,15 +157,12 @@ class IIndexInfo { } static bool IsSpecialColumn(const std::string& fieldName) { - return fieldName == SPEC_COL_PLAN_STEP - || fieldName == SPEC_COL_TX_ID - || fieldName == SPEC_COL_DELETE_FLAG; + return fieldName == SPEC_COL_PLAN_STEP || fieldName == SPEC_COL_TX_ID || fieldName == SPEC_COL_DELETE_FLAG; } static bool IsSpecialColumn(const ui32 fieldId) { - return fieldId == (ui32)ESpecialColumn::PLAN_STEP - || fieldId == (ui32)ESpecialColumn::TX_ID - || fieldId == (ui32)ESpecialColumn::DELETE_FLAG; + return fieldId == (ui32)ESpecialColumn::PLAN_STEP || fieldId == (ui32)ESpecialColumn::TX_ID || + fieldId == (ui32)ESpecialColumn::DELETE_FLAG; } static bool IsNullableVerified(const ui32 fieldId) { @@ -182,4 +187,4 @@ class IIndexInfo { virtual ~IIndexInfo() = default; }; -} // namespace NKikimr::NOlap +} // namespace NKikimr::NOlap diff --git a/ydb/core/tx/columnshard/engines/scheme/abstract/loader.cpp b/ydb/core/tx/columnshard/engines/scheme/abstract/loader.cpp deleted file mode 100644 index d74dc491519d..000000000000 --- a/ydb/core/tx/columnshard/engines/scheme/abstract/loader.cpp +++ /dev/null @@ -1,61 +0,0 @@ -#include "loader.h" -#include - -namespace NKikimr::NOlap { - -TString TColumnLoader::DebugString() const { - TStringBuilder result; - if (ExpectedSchema) { - result << "schema:" << ExpectedSchema->ToString() << ";"; - } - if (Transformer) { - result << "transformer:" << Transformer->DebugString() << ";"; - } - if (Serializer) { - result << "serializer:" << Serializer->DebugString() << ";"; - } - return result; -} - -TColumnLoader::TColumnLoader(NArrow::NTransformation::ITransformer::TPtr transformer, const NArrow::NSerialization::TSerializerContainer& serializer, - const std::shared_ptr& expectedSchema, const std::shared_ptr& defaultValue, const ui32 columnId) - : Transformer(transformer) - , Serializer(serializer) - , ExpectedSchema(expectedSchema) - , DefaultValue(defaultValue) - , ColumnId(columnId) { - Y_ABORT_UNLESS(ExpectedSchema); - auto fieldsCountStr = ::ToString(ExpectedSchema->num_fields()); - Y_ABORT_UNLESS(ExpectedSchema->num_fields() == 1, "%s", fieldsCountStr.data()); - Y_ABORT_UNLESS(Serializer); -} - -const std::shared_ptr& TColumnLoader::GetField() const { - return ExpectedSchema->field(0); -} - -arrow::Result> TColumnLoader::Apply(const TString& data) const { - Y_ABORT_UNLESS(Serializer); - arrow::Result> columnArray = - Transformer ? Serializer->Deserialize(data) : Serializer->Deserialize(data, ExpectedSchema); - if (!columnArray.ok()) { - return columnArray; - } - if (Transformer) { - return Transformer->Transform(*columnArray); - } else { - return columnArray; - } -} - -std::shared_ptr TColumnLoader::ApplyVerified(const TString& data) const { - return NArrow::TStatusValidator::GetValid(Apply(data)); -} - -std::shared_ptr TColumnLoader::ApplyVerifiedColumn(const TString& data) const { - auto rb = ApplyVerified(data); - AFL_VERIFY(rb->num_columns() == 1)("schema", rb->schema()->ToString()); - return rb->column(0); -} - -} \ No newline at end of file diff --git a/ydb/core/tx/columnshard/engines/scheme/abstract/loader.h b/ydb/core/tx/columnshard/engines/scheme/abstract/loader.h deleted file mode 100644 index d128caaecb58..000000000000 --- a/ydb/core/tx/columnshard/engines/scheme/abstract/loader.h +++ /dev/null @@ -1,47 +0,0 @@ -#pragma once -#include -#include -#include -#include - -namespace NKikimr::NOlap { - -class TColumnLoader { -private: - NArrow::NTransformation::ITransformer::TPtr Transformer; - NArrow::NSerialization::TSerializerContainer Serializer; - YDB_READONLY_DEF(std::shared_ptr, ExpectedSchema); - YDB_READONLY_DEF(std::shared_ptr, DefaultValue); - const ui32 ColumnId; -public: - bool IsEqualTo(const TColumnLoader& item) const { - if (!!Transformer != !!item.Transformer) { - return false; - } else if (!!Transformer && !Transformer->IsEqualTo(*item.Transformer)) { - return false; - } - if (!Serializer.IsEqualTo(item.Serializer)) { - return false; - } - return true; - } - - TString DebugString() const; - - TColumnLoader(NArrow::NTransformation::ITransformer::TPtr transformer, const NArrow::NSerialization::TSerializerContainer& serializer, - const std::shared_ptr& expectedSchema, const std::shared_ptr& defaultValue, const ui32 columnId); - - ui32 GetColumnId() const { - return ColumnId; - } - - const std::shared_ptr& GetField() const; - - arrow::Result> Apply(const TString& data) const; - - std::shared_ptr ApplyVerified(const TString& data) const; - - std::shared_ptr ApplyVerifiedColumn(const TString& data) const; -}; - -} diff --git a/ydb/core/tx/columnshard/engines/scheme/abstract/ya.make b/ydb/core/tx/columnshard/engines/scheme/abstract/ya.make index b830415daae1..79b12f94389e 100644 --- a/ydb/core/tx/columnshard/engines/scheme/abstract/ya.make +++ b/ydb/core/tx/columnshard/engines/scheme/abstract/ya.make @@ -1,15 +1,12 @@ LIBRARY() SRCS( - saver.cpp index_info.cpp - loader.cpp ) PEERDIR( ydb/library/actors/core - ydb/core/formats/arrow/transformer - ydb/core/formats/arrow/serializer + ydb/core/formats/arrow/save_load ) YQL_LAST_ABI_VERSION() diff --git a/ydb/core/tx/columnshard/engines/scheme/column/info.cpp b/ydb/core/tx/columnshard/engines/scheme/column/info.cpp index b9473398fdf3..9acf4085bdde 100644 --- a/ydb/core/tx/columnshard/engines/scheme/column/info.cpp +++ b/ydb/core/tx/columnshard/engines/scheme/column/info.cpp @@ -19,8 +19,7 @@ NArrow::NTransformation::ITransformer::TPtr TSimpleColumnInfo::GetLoadTransforme return transformer; } -TConclusionStatus TSimpleColumnInfo::DeserializeFromProto(const NKikimrSchemeOp::TOlapColumnDescription& columnInfo) -{ +TConclusionStatus TSimpleColumnInfo::DeserializeFromProto(const NKikimrSchemeOp::TOlapColumnDescription& columnInfo) { AFL_VERIFY(columnInfo.GetId() == ColumnId); if (columnInfo.HasSerializer()) { AFL_VERIFY(Serializer.DeserializeFromProto(columnInfo.GetSerializer())); @@ -30,13 +29,16 @@ TConclusionStatus TSimpleColumnInfo::DeserializeFromProto(const NKikimrSchemeOp: if (columnInfo.HasDefaultValue()) { DefaultValue.DeserializeFromProto(columnInfo.GetDefaultValue()).Validate(); } + if (columnInfo.HasDataAccessorConstructor()) { + AFL_VERIFY(DataAccessorConstructor.DeserializeFromProto(columnInfo.GetDataAccessorConstructor())); + } AFL_VERIFY(Serializer); if (columnInfo.HasDictionaryEncoding()) { auto settings = NArrow::NDictionary::TEncodingSettings::BuildFromProto(columnInfo.GetDictionaryEncoding()); Y_ABORT_UNLESS(settings.IsSuccess()); DictionaryEncoding = *settings; } - Loader = std::make_shared(GetLoadTransformer(), Serializer, ArrowSchema, DefaultValue.GetValue(), ColumnId); + Loader = std::make_shared(GetLoadTransformer(), Serializer, DataAccessorConstructor, ArrowField, DefaultValue.GetValue(), ColumnId); return TConclusionStatus::Success(); } @@ -45,14 +47,14 @@ TSimpleColumnInfo::TSimpleColumnInfo(const ui32 columnId, const std::shared_ptr< const std::shared_ptr& defaultValue) : ColumnId(columnId) , ArrowField(arrowField) - , ArrowSchema(std::make_shared(arrow::FieldVector({arrowField}))) , Serializer(serializer) , NeedMinMax(needMinMax) , IsSorted(isSorted) , DefaultValue(defaultValue) { ColumnName = ArrowField->name(); - Loader = std::make_shared(GetLoadTransformer(), Serializer, ArrowSchema, DefaultValue.GetValue(), ColumnId); + Loader = std::make_shared( + GetLoadTransformer(), Serializer, DataAccessorConstructor, ArrowField, DefaultValue.GetValue(), ColumnId); } std::vector> TSimpleColumnInfo::ActualizeColumnData(const std::vector>& source, const TSimpleColumnInfo& sourceColumnFeatures) const { @@ -86,7 +88,7 @@ std::vector> TSimpleColumnInf } std::vector> result; for (auto&& s : source) { - auto data = NArrow::TStatusValidator::GetValid(sourceColumnFeatures.Loader->Apply(s->GetData())); + auto data = sourceColumnFeatures.Loader->ApplyRawVerified(s->GetData()); result.emplace_back(s->CopyWithAnotherBlob(GetColumnSaver().Apply(data), *this)); } return result; diff --git a/ydb/core/tx/columnshard/engines/scheme/column/info.h b/ydb/core/tx/columnshard/engines/scheme/column/info.h index 6db21f1fe876..6be98664940b 100644 --- a/ydb/core/tx/columnshard/engines/scheme/column/info.h +++ b/ydb/core/tx/columnshard/engines/scheme/column/info.h @@ -1,17 +1,18 @@ #pragma once -#include -#include -#include - +#include +#include #include +#include +#include #include #include -#include +#include +#include #include -#include #include +#include namespace NKikimr::NOlap { @@ -22,8 +23,9 @@ class TSimpleColumnInfo { YDB_READONLY(ui32, ColumnId, 0); YDB_READONLY_DEF(TString, ColumnName); YDB_READONLY_DEF(std::shared_ptr, ArrowField); - YDB_READONLY_DEF(std::shared_ptr, ArrowSchema); YDB_READONLY(NArrow::NSerialization::TSerializerContainer, Serializer, NArrow::NSerialization::TSerializerContainer::GetDefaultSerializer()); + YDB_READONLY( + NArrow::NAccessor::TConstructorContainer, DataAccessorConstructor, NArrow::NAccessor::TConstructorContainer::GetDefaultConstructor()); YDB_READONLY(bool, NeedMinMax, false); YDB_READONLY(bool, IsSorted, false); YDB_READONLY_DEF(TColumnDefaultScalarValue, DefaultValue); @@ -32,8 +34,7 @@ class TSimpleColumnInfo { NArrow::NTransformation::ITransformer::TPtr GetLoadTransformer() const; public: - - TSimpleColumnInfo(const ui32 columnId, const std::shared_ptr& arrowField, + TSimpleColumnInfo(const ui32 columnId, const std::shared_ptr& arrowField, const NArrow::NSerialization::TSerializerContainer& serializer, const bool needMinMax, const bool isSorted, const std::shared_ptr& defaultValue); @@ -43,7 +44,8 @@ class TSimpleColumnInfo { return TColumnSaver(transformer, Serializer); } - std::vector> ActualizeColumnData(const std::vector>& source, const TSimpleColumnInfo& sourceColumnFeatures) const; + std::vector> ActualizeColumnData( + const std::vector>& source, const TSimpleColumnInfo& sourceColumnFeatures) const; TString DebugString() const { TStringBuilder sb; @@ -62,4 +64,4 @@ class TSimpleColumnInfo { } }; -} // namespace NKikimr::NOlap +} // namespace NKikimr::NOlap diff --git a/ydb/core/tx/columnshard/engines/scheme/column_features.h b/ydb/core/tx/columnshard/engines/scheme/column_features.h index 671b35e57734..cacc3078e559 100644 --- a/ydb/core/tx/columnshard/engines/scheme/column_features.h +++ b/ydb/core/tx/columnshard/engines/scheme/column_features.h @@ -1,6 +1,4 @@ #pragma once -#include "abstract/loader.h" -#include "abstract/saver.h" #include "column/info.h" #include @@ -10,6 +8,7 @@ #include #include #include +#include #include #include diff --git a/ydb/core/tx/columnshard/engines/scheme/index_info.cpp b/ydb/core/tx/columnshard/engines/scheme/index_info.cpp index 5bdfc2838eb9..1e4bbab0d589 100644 --- a/ydb/core/tx/columnshard/engines/scheme/index_info.cpp +++ b/ydb/core/tx/columnshard/engines/scheme/index_info.cpp @@ -346,20 +346,6 @@ void TIndexInfo::InitializeCaches(const std::shared_ptr& opera } } -std::vector> TIndexInfo::MakeEmptyChunks(const ui32 columnId, const std::vector& pages, const TSimpleColumnInfo& columnInfo) const { - std::vector> result; - auto columnArrowSchema = GetColumnSchema(columnId); - TColumnSaver saver = GetColumnSaver(columnId); - ui32 idx = 0; - for (auto p : pages) { - auto arr = NArrow::MakeEmptyBatch(columnArrowSchema, p); - AFL_VERIFY(arr->num_columns() == 1)("count", arr->num_columns()); - result.emplace_back(std::make_shared(saver.Apply(arr), arr->column(0), TChunkAddress(columnId, idx), columnInfo)); - ++idx; - } - return result; -} - NSplitter::TEntityGroups TIndexInfo::GetEntityGroupsByStorageId(const TString& specialTier, const IStoragesManager& storages) const { NSplitter::TEntityGroups groups(storages.GetDefaultOperator()->GetBlobSplitSettings(), IStoragesManager::DefaultStorageId); for (auto&& i : GetEntityIds()) { diff --git a/ydb/core/tx/columnshard/engines/scheme/index_info.h b/ydb/core/tx/columnshard/engines/scheme/index_info.h index 622f8c741050..be70ac1ae310 100644 --- a/ydb/core/tx/columnshard/engines/scheme/index_info.h +++ b/ydb/core/tx/columnshard/engines/scheme/index_info.h @@ -97,8 +97,6 @@ struct TIndexInfo : public NTable::TScheme::TTableSchema, public IIndexInfo { return result; } - std::vector> MakeEmptyChunks(const ui32 columnId, const std::vector& pages, const TSimpleColumnInfo& columnInfo) const; - const THashMap& GetIndexes() const { return Indexes; } diff --git a/ydb/core/tx/columnshard/engines/scheme/versions/abstract_scheme.cpp b/ydb/core/tx/columnshard/engines/scheme/versions/abstract_scheme.cpp index 41b11c7b3122..c6efc7a55678 100644 --- a/ydb/core/tx/columnshard/engines/scheme/versions/abstract_scheme.cpp +++ b/ydb/core/tx/columnshard/engines/scheme/versions/abstract_scheme.cpp @@ -46,9 +46,8 @@ TConclusion> ISnapshotSchema::Normali return conclusion; } } else if (restoreColumnIds.contains(columnId)) { - result->AddField(resultField, - NArrow::TThreadSimpleArraysCache::Get(resultField->type(), GetExternalDefaultValueVerified(columnId), batch->num_rows())) - .Validate(); + AFL_VERIFY(!!GetExternalDefaultValueVerified(columnId) || GetIndexInfo().IsNullableVerified(columnId)); + result->AddField(resultField, GetColumnLoaderVerified(columnId)->BuildDefaultAccessor(batch->num_rows())).Validate(); } } return result; @@ -145,19 +144,19 @@ std::shared_ptr ISnapshotSchema::GetFieldByColumnIdVerified(const return result; } -std::shared_ptr ISnapshotSchema::GetColumnLoaderVerified(const ui32 columnId) const { +std::shared_ptr ISnapshotSchema::GetColumnLoaderVerified(const ui32 columnId) const { auto result = GetColumnLoaderOptional(columnId); AFL_VERIFY(result); return result; } -std::shared_ptr ISnapshotSchema::GetColumnLoaderVerified(const std::string& columnName) const { +std::shared_ptr ISnapshotSchema::GetColumnLoaderVerified(const std::string& columnName) const { auto result = GetColumnLoaderOptional(columnName); AFL_VERIFY(result); return result; } -std::shared_ptr ISnapshotSchema::GetColumnLoaderOptional(const std::string& columnName) const { +std::shared_ptr ISnapshotSchema::GetColumnLoaderOptional(const std::string& columnName) const { const std::optional id = GetColumnIdOptional(columnName); if (id) { return GetColumnLoaderOptional(*id); diff --git a/ydb/core/tx/columnshard/engines/scheme/versions/abstract_scheme.h b/ydb/core/tx/columnshard/engines/scheme/versions/abstract_scheme.h index a2a4cefd9215..7153866b3387 100644 --- a/ydb/core/tx/columnshard/engines/scheme/versions/abstract_scheme.h +++ b/ydb/core/tx/columnshard/engines/scheme/versions/abstract_scheme.h @@ -1,8 +1,8 @@ #pragma once #include -#include -#include +#include +#include #include #include @@ -21,18 +21,18 @@ class ISnapshotSchema { using TPtr = std::shared_ptr; virtual ~ISnapshotSchema() {} - virtual std::shared_ptr GetColumnLoaderOptional(const ui32 columnId) const = 0; - std::shared_ptr GetColumnLoaderVerified(const ui32 columnId) const; - std::shared_ptr GetColumnLoaderOptional(const std::string& columnName) const; - std::shared_ptr GetColumnLoaderVerified(const std::string& columnName) const; + virtual std::shared_ptr GetColumnLoaderOptional(const ui32 columnId) const = 0; + std::shared_ptr GetColumnLoaderVerified(const ui32 columnId) const; + std::shared_ptr GetColumnLoaderOptional(const std::string& columnName) const; + std::shared_ptr GetColumnLoaderVerified(const std::string& columnName) const; bool IsSpecialColumnId(const ui32 columnId) const; - virtual TColumnSaver GetColumnSaver(const ui32 columnId) const = 0; - TColumnSaver GetColumnSaver(const TString& columnName) const { + virtual NArrow::NAccessor::TColumnSaver GetColumnSaver(const ui32 columnId) const = 0; + NArrow::NAccessor::TColumnSaver GetColumnSaver(const TString& columnName) const { return GetColumnSaver(GetColumnId(columnName)); } - TColumnSaver GetColumnSaver(const std::string& columnName) const { + NArrow::NAccessor::TColumnSaver GetColumnSaver(const std::string& columnName) const { return GetColumnSaver(TString(columnName.data(), columnName.size())); } diff --git a/ydb/core/tx/columnshard/engines/scheme/ya.make b/ydb/core/tx/columnshard/engines/scheme/ya.make index 8e41573bf419..744458ff4dcb 100644 --- a/ydb/core/tx/columnshard/engines/scheme/ya.make +++ b/ydb/core/tx/columnshard/engines/scheme/ya.make @@ -20,6 +20,7 @@ PEERDIR( ydb/core/tx/columnshard/engines/scheme/tiering ydb/core/tx/columnshard/engines/scheme/column ydb/core/tx/columnshard/engines/scheme/defaults + ydb/core/formats/arrow/accessor ydb/core/tx/columnshard/blobs_action/abstract ) diff --git a/ydb/core/tx/columnshard/engines/storage/chunks/column.cpp b/ydb/core/tx/columnshard/engines/storage/chunks/column.cpp index 485802b0e3c1..4a527f913fc7 100644 --- a/ydb/core/tx/columnshard/engines/storage/chunks/column.cpp +++ b/ydb/core/tx/columnshard/engines/storage/chunks/column.cpp @@ -1,18 +1,20 @@ #include "column.h" -#include +#include namespace NKikimr::NOlap::NChunks { -std::vector> TChunkPreparation::DoInternalSplitImpl(const TColumnSaver& saver, const std::shared_ptr& counters, const std::vector& splitSizes) const { - auto rb = NArrow::TStatusValidator::GetValid(ColumnInfo.GetLoader()->Apply(Data)); +std::vector> TChunkPreparation::DoInternalSplitImpl( + const TColumnSaver& saver, const std::shared_ptr& /*counters*/, const std::vector& splitSizes) const { + auto accessor = ColumnInfo.GetLoader()->ApplyVerified(Data, GetRecordsCountVerified()); + std::vector chunks = accessor->SplitBySizes(saver, Data, splitSizes); - auto chunks = TSimpleSplitter(saver, counters).SplitBySizes(rb, Data, splitSizes); std::vector> newChunks; for (auto&& i : chunks) { - Y_ABORT_UNLESS(i.GetSlicedBatch()->num_columns() == 1); - newChunks.emplace_back(std::make_shared(saver.Apply(i.GetSlicedBatch()), i.GetSlicedBatch()->column(0), TChunkAddress(GetColumnId(), GetChunkIdxOptional().value_or(0)), ColumnInfo)); + newChunks.emplace_back(std::make_shared( + i.GetSerializedData(), i.GetArray(), TChunkAddress(GetColumnId(), GetChunkIdxOptional().value_or(0)), ColumnInfo)); } + return newChunks; } -} +} // namespace NKikimr::NOlap::NChunks diff --git a/ydb/core/tx/columnshard/engines/storage/chunks/column.h b/ydb/core/tx/columnshard/engines/storage/chunks/column.h index a8c4be1ae3de..9de818c49fb6 100644 --- a/ydb/core/tx/columnshard/engines/storage/chunks/column.h +++ b/ydb/core/tx/columnshard/engines/storage/chunks/column.h @@ -56,14 +56,14 @@ class TChunkPreparation: public IPortionColumnChunk { AFL_VERIFY(Data.size() == Record.BlobRange.Size || Record.BlobRange.Size == 0)("data", Data.size())("record", Record.BlobRange.Size); } - TChunkPreparation(const TString& data, const std::shared_ptr& column, const TChunkAddress& address, const TSimpleColumnInfo& columnInfo) + TChunkPreparation(const TString& data, const std::shared_ptr& column, const TChunkAddress& address, const TSimpleColumnInfo& columnInfo) : TBase(address.GetColumnId()) , Data(data) , Record(address, column, columnInfo) , ColumnInfo(columnInfo) { - Y_ABORT_UNLESS(column->length()); - First = NArrow::TStatusValidator::GetValid(column->GetScalar(0)); - Last = NArrow::TStatusValidator::GetValid(column->GetScalar(column->length() - 1)); + Y_ABORT_UNLESS(column->GetRecordsCount()); + First = column->GetScalar(0); + Last = column->GetScalar(column->GetRecordsCount() - 1); Record.BlobRange.Size = data.size(); } }; diff --git a/ydb/core/tx/columnshard/engines/storage/chunks/data.cpp b/ydb/core/tx/columnshard/engines/storage/chunks/data.cpp index 79613b5b9798..007dff83e914 100644 --- a/ydb/core/tx/columnshard/engines/storage/chunks/data.cpp +++ b/ydb/core/tx/columnshard/engines/storage/chunks/data.cpp @@ -9,4 +9,13 @@ void TPortionIndexChunk::DoAddIntoPortionBeforeBlob(const TBlobRangeLink16& bRan portionInfo.AddIndex(TIndexChunk(GetEntityId(), GetChunkIdxVerified(), RecordsCount, RawBytes, bRange)); } +std::shared_ptr TPortionIndexChunk::DoCopyWithAnotherBlob( + TString&& data, const TSimpleColumnInfo& /*columnInfo*/) const { + return std::make_shared(GetChunkAddressVerified(), RecordsCount, RawBytes, std::move(data)); +} + +void TPortionIndexChunk::DoAddInplaceIntoPortion(TPortionInfoConstructor& portionInfo) const { + portionInfo.AddIndex(TIndexChunk(GetEntityId(), GetChunkIdxVerified(), RecordsCount, RawBytes, GetData())); +} + } // namespace NKikimr::NOlap::NIndexes \ No newline at end of file diff --git a/ydb/core/tx/columnshard/engines/storage/chunks/data.h b/ydb/core/tx/columnshard/engines/storage/chunks/data.h index 8409243df347..e3f22ae2ed9d 100644 --- a/ydb/core/tx/columnshard/engines/storage/chunks/data.h +++ b/ydb/core/tx/columnshard/engines/storage/chunks/data.h @@ -36,9 +36,9 @@ class TPortionIndexChunk: public IPortionDataChunk { return nullptr; } virtual void DoAddIntoPortionBeforeBlob(const TBlobRangeLink16& bRange, TPortionInfoConstructor& portionInfo) const override; - virtual std::shared_ptr DoCopyWithAnotherBlob(TString&& data, const TSimpleColumnInfo& /*columnInfo*/) const override { - return std::make_shared(GetChunkAddressVerified(), RecordsCount, RawBytes, std::move(data)); - } + virtual std::shared_ptr DoCopyWithAnotherBlob(TString&& data, const TSimpleColumnInfo& /*columnInfo*/) const override; + virtual void DoAddInplaceIntoPortion(TPortionInfoConstructor& portionInfo) const override; + public: TPortionIndexChunk(const TChunkAddress& address, const ui32 recordsCount, const ui64 rawBytes, const TString& data) : TBase(address.GetColumnId(), address.GetChunkIdx()) diff --git a/ydb/core/tx/columnshard/engines/storage/chunks/null_column.cpp b/ydb/core/tx/columnshard/engines/storage/chunks/null_column.cpp deleted file mode 100644 index 9aa56e56eda3..000000000000 --- a/ydb/core/tx/columnshard/engines/storage/chunks/null_column.cpp +++ /dev/null @@ -1,5 +0,0 @@ -#include "null_column.h" - -namespace NKikimr::NOlap::NChunks { - -} diff --git a/ydb/core/tx/columnshard/engines/storage/chunks/null_column.h b/ydb/core/tx/columnshard/engines/storage/chunks/null_column.h deleted file mode 100644 index 3b2420f0c223..000000000000 --- a/ydb/core/tx/columnshard/engines/storage/chunks/null_column.h +++ /dev/null @@ -1,60 +0,0 @@ -#pragma once -#include -#include -#include -#include - -namespace NKikimr::NOlap::NChunks { - -class TDefaultChunkPreparation: public IPortionColumnChunk { -private: - using TBase = IPortionColumnChunk; - const std::shared_ptr DefaultValue; - const ui32 RecordsCount; - ui64 RawBytes = 0; - TString Data; -protected: - virtual std::vector> DoInternalSplitImpl(const TColumnSaver& /*saver*/, const std::shared_ptr& /*counters*/, - const std::vector& /*splitSizes*/) const override { - AFL_VERIFY(false); - return {}; - } - virtual const TString& DoGetData() const override { - return Data; - } - virtual ui32 DoGetRecordsCountImpl() const override { - return RecordsCount; - } - virtual ui64 DoGetRawBytesImpl() const override { - return RawBytes; - } - virtual TString DoDebugString() const override { - return TStringBuilder() << "rc=" << RecordsCount << ";data_size=" << Data.size() << ";"; - } - virtual TSimpleChunkMeta DoBuildSimpleChunkMeta() const override { - AFL_VERIFY(false); - return TSimpleChunkMeta(nullptr, false, false); - } - virtual std::shared_ptr DoGetFirstScalar() const override { - return DefaultValue; - } - virtual std::shared_ptr DoGetLastScalar() const override { - return DefaultValue; - } - -public: - TDefaultChunkPreparation(const ui32 columnId, const ui32 recordsCount, const std::shared_ptr& f, - const std::shared_ptr& defaultValue, const TColumnSaver& saver) - : TBase(columnId) - , DefaultValue(defaultValue) - , RecordsCount(recordsCount) - { - Y_ABORT_UNLESS(RecordsCount); - auto arrowData = NArrow::TThreadSimpleArraysCache::Get(f->type(), defaultValue, RecordsCount); - RawBytes = NArrow::GetArrayDataSize(arrowData); - Data = saver.Apply(arrowData, f); - SetChunkIdx(0); - } -}; - -} diff --git a/ydb/core/tx/columnshard/engines/storage/chunks/ya.make b/ydb/core/tx/columnshard/engines/storage/chunks/ya.make index d61554bd6f0c..cff5b9f40b53 100644 --- a/ydb/core/tx/columnshard/engines/storage/chunks/ya.make +++ b/ydb/core/tx/columnshard/engines/storage/chunks/ya.make @@ -3,7 +3,6 @@ LIBRARY() SRCS( data.cpp column.cpp - null_column.cpp ) PEERDIR( diff --git a/ydb/core/tx/columnshard/engines/storage/granule/granule.h b/ydb/core/tx/columnshard/engines/storage/granule/granule.h index c8b3e302f1e7..d79ef50e1883 100644 --- a/ydb/core/tx/columnshard/engines/storage/granule/granule.h +++ b/ydb/core/tx/columnshard/engines/storage/granule/granule.h @@ -21,9 +21,10 @@ class TColumnChunkLoadContext; class TDataClassSummary: public NColumnShard::TBaseGranuleDataClassSummary { private: friend class TGranuleMeta; - THashMap ColumnStats; + THashMap ColumnStats; + public: - const THashMap& GetColumnStats() const { + const THashMap& GetColumnStats() const { return ColumnStats; } @@ -231,11 +232,11 @@ class TGranuleMeta: TNonCopyable { } } - std::shared_ptr BuildSerializationStats(ISnapshotSchema::TPtr schema) const { - auto result = std::make_shared(); + std::shared_ptr BuildSerializationStats(ISnapshotSchema::TPtr schema) const { + auto result = std::make_shared(); for (auto&& i : GetAdditiveSummary().GetCompacted().GetColumnStats()) { auto field = schema->GetFieldByColumnIdVerified(i.first); - NOlap::TColumnSerializationStat columnInfo(i.first, field->name()); + NArrow::NSplitter::TColumnSerializationStat columnInfo(i.first, field->name()); columnInfo.Merge(i.second); result->AddStat(columnInfo); } diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/max/meta.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/max/meta.cpp index 3556cb3d1a9d..553daec4f0fe 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/max/meta.cpp +++ b/ydb/core/tx/columnshard/engines/storage/indexes/max/meta.cpp @@ -16,8 +16,8 @@ TString TIndexMeta::DoBuildIndexImpl(TChunkedBatchReader& reader) const { { TChunkedColumnReader cReader = *reader.begin(); for (reader.Start(); cReader.IsCorrect(); cReader.ReadNextChunk()) { - auto minMax = NArrow::FindMinMaxPosition(cReader.GetCurrentChunk()); - auto currentScalar = NArrow::GetScalar(cReader.GetCurrentChunk(), minMax.second); + auto currentScalar = cReader.GetCurrentAccessor()->GetMaxScalar(); + AFL_VERIFY(currentScalar); if (!result || NArrow::ScalarCompare(*result, *currentScalar) == -1) { result = currentScalar; } diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/portions/meta.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/portions/meta.cpp index cd5f4f3c51cc..e62bc99d0a7f 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/portions/meta.cpp +++ b/ydb/core/tx/columnshard/engines/storage/indexes/portions/meta.cpp @@ -6,16 +6,6 @@ namespace NKikimr::NOlap::NIndexes { -void TPortionIndexChunk::DoAddIntoPortionBeforeBlob( - const TBlobRangeLink16& bRange, TPortionInfoConstructor& portionInfo) const { - AFL_VERIFY(!bRange.IsValid()); - portionInfo.AddIndex(TIndexChunk(GetEntityId(), GetChunkIdxVerified(), RecordsCount, RawBytes, bRange)); -} - -void TPortionIndexChunk::DoAddInplaceIntoPortion(TPortionInfoConstructor& portionInfo) const { - portionInfo.AddIndex(TIndexChunk(GetEntityId(), GetChunkIdxVerified(), RecordsCount, RawBytes, GetData())); -} - std::shared_ptr TIndexByColumns::DoBuildIndex( const THashMap>>& data, const TIndexInfo& indexInfo) const { AFL_VERIFY(Serializer); diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/portions/meta.h b/ydb/core/tx/columnshard/engines/storage/indexes/portions/meta.h index 3f2f5dfb872f..427ee98d99d2 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/portions/meta.h +++ b/ydb/core/tx/columnshard/engines/storage/indexes/portions/meta.h @@ -1,56 +1,10 @@ #pragma once #include -#include +#include #include namespace NKikimr::NOlap::NIndexes { -class TPortionIndexChunk: public IPortionDataChunk { -private: - using TBase = IPortionDataChunk; - const ui32 RecordsCount; - const ui64 RawBytes; - const TString Data; -protected: - virtual const TString& DoGetData() const override { - return Data; - } - virtual TString DoDebugString() const override { - return ""; - } - virtual std::vector> DoInternalSplit(const TColumnSaver& /*saver*/, const std::shared_ptr& /*counters*/, const std::vector& /*splitSizes*/) const override { - AFL_VERIFY(false); - return {}; - } - virtual bool DoIsSplittable() const override { - return false; - } - virtual std::optional DoGetRecordsCount() const override { - return RecordsCount; - } - virtual std::optional DoGetRawBytes() const override { - return RawBytes; - } - virtual std::shared_ptr DoGetFirstScalar() const override { - return nullptr; - } - virtual std::shared_ptr DoGetLastScalar() const override { - return nullptr; - } - virtual void DoAddIntoPortionBeforeBlob(const TBlobRangeLink16& bRange, TPortionInfoConstructor& portionInfo) const override; - virtual void DoAddInplaceIntoPortion(TPortionInfoConstructor& portionInfo) const override; - -public: - TPortionIndexChunk(const TChunkAddress& address, const ui32 recordsCount, const ui64 rawBytes, const TString& data) - : TBase(address.GetColumnId(), address.GetChunkIdx()) - , RecordsCount(recordsCount) - , RawBytes(rawBytes) - , Data(data) - { - } - -}; - class TIndexByColumns: public IIndexMeta { private: using TBase = IIndexMeta; diff --git a/ydb/core/tx/columnshard/normalizer/portion/chunks.cpp b/ydb/core/tx/columnshard/normalizer/portion/chunks.cpp index d7981d98d629..f42f38061e45 100644 --- a/ydb/core/tx/columnshard/normalizer/portion/chunks.cpp +++ b/ydb/core/tx/columnshard/normalizer/portion/chunks.cpp @@ -59,11 +59,12 @@ class TRowsAndBytesChangesTask: public NConveyor::ITask { Y_ABORT_UNLESS(!!columnLoader); TPortionInfo::TAssembleBlobInfo assembleBlob(blobData); + assembleBlob.SetExpectedRecordsCount(chunkInfo.GetRecordsCount()); auto batch = assembleBlob.BuildRecordBatch(*columnLoader); Y_ABORT_UNLESS(!!batch); - chunkInfo.MutableUpdate().SetNumRows(batch->num_rows()); - chunkInfo.MutableUpdate().SetRawBytes(NArrow::GetBatchDataSize(batch)); + chunkInfo.MutableUpdate().SetNumRows(batch->GetRecordsCount()); + chunkInfo.MutableUpdate().SetRawBytes(batch->GetRawSizeVerified()); } auto changes = std::make_shared(std::move(Chunks)); diff --git a/ydb/core/tx/columnshard/normalizer/portion/chunks.h b/ydb/core/tx/columnshard/normalizer/portion/chunks.h index 59fbfe57da0c..c8a09669c7b8 100644 --- a/ydb/core/tx/columnshard/normalizer/portion/chunks.h +++ b/ydb/core/tx/columnshard/normalizer/portion/chunks.h @@ -74,6 +74,10 @@ namespace NKikimr::NOlap { , CLContext(rowset, dsGroupSelector) {} + ui32 GetRecordsCount() const { + return CLContext.GetMetaProto().GetNumRows(); + } + const TBlobRange& GetBlobRange() const { return CLContext.GetBlobRange(); } diff --git a/ydb/core/tx/columnshard/splitter/abstract/chunk_meta.cpp b/ydb/core/tx/columnshard/splitter/abstract/chunk_meta.cpp index 646a458638dd..1e66bfb46e07 100644 --- a/ydb/core/tx/columnshard/splitter/abstract/chunk_meta.cpp +++ b/ydb/core/tx/columnshard/splitter/abstract/chunk_meta.cpp @@ -4,23 +4,20 @@ namespace NKikimr::NOlap { -TSimpleChunkMeta::TSimpleChunkMeta(const std::shared_ptr& column, const bool needMax, const bool isSortedColumn) { +TSimpleChunkMeta::TSimpleChunkMeta( + const std::shared_ptr& column, const bool needMax, const bool isSortedColumn) { Y_ABORT_UNLESS(column); - Y_ABORT_UNLESS(column->length()); - NumRows = column->length(); - RawBytes = NArrow::GetArrayDataSize(column); + Y_ABORT_UNLESS(column->GetRecordsCount()); + NumRows = column->GetRecordsCount(); + RawBytes = column->GetRawSizeVerified(); if (needMax) { - std::pair minMaxPos = {0, (column->length() - 1)}; if (!isSortedColumn) { - minMaxPos = NArrow::FindMinMaxPosition(column); - Y_ABORT_UNLESS(minMaxPos.first >= 0); - Y_ABORT_UNLESS(minMaxPos.second >= 0); + Max = column->GetMaxScalar(); + } else { + Max = column->GetScalar(column->GetRecordsCount() - 1); } - - Max = NArrow::GetScalar(column, minMaxPos.second); - - Y_ABORT_UNLESS(Max); +// AFL_VERIFY(Max); } } diff --git a/ydb/core/tx/columnshard/splitter/abstract/chunk_meta.h b/ydb/core/tx/columnshard/splitter/abstract/chunk_meta.h index 8f8f902e4095..0e54b4c9e42a 100644 --- a/ydb/core/tx/columnshard/splitter/abstract/chunk_meta.h +++ b/ydb/core/tx/columnshard/splitter/abstract/chunk_meta.h @@ -1,4 +1,6 @@ #pragma once +#include + #include #include @@ -17,7 +19,7 @@ class TSimpleChunkMeta { ui32 RawBytes = 0; TSimpleChunkMeta() = default; public: - TSimpleChunkMeta(const std::shared_ptr& column, const bool needMinMax, const bool isSortedColumn); + TSimpleChunkMeta(const std::shared_ptr& column, const bool needMinMax, const bool isSortedColumn); ui64 GetMetadataSize() const { return sizeof(ui32) + sizeof(ui32) + 8 * 3 * 2; diff --git a/ydb/core/tx/columnshard/splitter/abstract/chunks.h b/ydb/core/tx/columnshard/splitter/abstract/chunks.h index 3463fcea0374..d0300915f098 100644 --- a/ydb/core/tx/columnshard/splitter/abstract/chunks.h +++ b/ydb/core/tx/columnshard/splitter/abstract/chunks.h @@ -15,7 +15,6 @@ namespace NKikimr::NOlap { class TPortionInfo; class TPortionInfoConstructor; class TSimpleColumnInfo; -class TColumnSaver; class IPortionDataChunk { private: diff --git a/ydb/core/tx/columnshard/splitter/batch_slice.cpp b/ydb/core/tx/columnshard/splitter/batch_slice.cpp index 15fd2506e3ba..7f6cc05c1e7b 100644 --- a/ydb/core/tx/columnshard/splitter/batch_slice.cpp +++ b/ydb/core/tx/columnshard/splitter/batch_slice.cpp @@ -1,5 +1,4 @@ #include "batch_slice.h" -#include "simple.h" #include namespace NKikimr::NOlap { @@ -148,7 +147,8 @@ bool TGeneralSerializedSlice::GroupBlobsImpl(const NSplitter::TGroupFeatures& fe return true; } -TGeneralSerializedSlice::TGeneralSerializedSlice(const THashMap>>& data, ISchemaDetailInfo::TPtr schema, std::shared_ptr counters) +TGeneralSerializedSlice::TGeneralSerializedSlice(const THashMap>>& data, + NArrow::NSplitter::ISchemaDetailInfo::TPtr schema, std::shared_ptr counters) : Schema(schema) , Counters(counters) { std::optional recordsCount; @@ -169,60 +169,14 @@ TGeneralSerializedSlice::TGeneralSerializedSlice(const THashMap counters) +TGeneralSerializedSlice::TGeneralSerializedSlice( + const ui32 recordsCount, NArrow::NSplitter::ISchemaDetailInfo::TPtr schema, std::shared_ptr counters) : RecordsCount(recordsCount) , Schema(schema) , Counters(counters) { } -TBatchSerializedSlice::TBatchSerializedSlice(const std::shared_ptr& batch, ISchemaDetailInfo::TPtr schema, std::shared_ptr counters, - const NSplitter::TSplitSettings& settings) - : TBase(TValidator::CheckNotNull(batch)->num_rows(), schema, counters) - , Batch(batch) -{ - Y_ABORT_UNLESS(batch); - Data.reserve(batch->num_columns()); - for (auto&& i : batch->schema()->fields()) { - TSplittedEntity c(schema->GetColumnId(i->name())); - Data.emplace_back(std::move(c)); - } - - ui32 idx = 0; - for (auto&& i : batch->columns()) { - auto& c = Data[idx]; - auto columnSaver = schema->GetColumnSaver(c.GetEntityId()); - auto stats = schema->GetColumnSerializationStats(c.GetEntityId()); - TSimpleSplitter splitter(columnSaver, Counters); - splitter.SetStats(stats); - std::vector> chunks; - for (auto&& i : splitter.Split(i, Schema->GetField(c.GetEntityId()), settings.GetMaxBlobSize())) { - chunks.emplace_back(std::make_shared(c.GetEntityId(), i, Schema)); - } - c.SetChunks(chunks); - Size += c.GetSize(); - ++idx; - } -} - -std::vector TBatchSerializedSlice::BuildSimpleSlices(const std::shared_ptr& batch, const NSplitter::TSplitSettings& settings, const std::shared_ptr& counters, const ISchemaDetailInfo::TPtr& schemaInfo) { - std::vector slices; - auto stats = schemaInfo->GetBatchSerializationStats(batch); - ui32 recordsCount = settings.GetMinRecordsCount(); - if (stats) { - const ui32 recordsCountForMinSize = stats->PredictOptimalPackRecordsCount(batch->num_rows(), settings.GetMinBlobSize()).value_or(recordsCount); - const ui32 recordsCountForMaxPortionSize = stats->PredictOptimalPackRecordsCount(batch->num_rows(), settings.GetMaxPortionSize()).value_or(recordsCount); - recordsCount = std::min(recordsCountForMaxPortionSize, std::max(recordsCount, recordsCountForMinSize)); - } - auto linearSplitInfo = TSimpleSplitter::GetOptimalLinearSplitting(batch->num_rows(), recordsCount); - for (auto it = linearSplitInfo.StartIterator(); it.IsValid(); it.Next()) { - std::shared_ptr current = batch->Slice(it.GetPosition(), it.GetCurrentPackSize()); - TBatchSerializedSlice slice(current, schemaInfo, counters, settings); - slices.emplace_back(std::move(slice)); - } - return slices; -} - void TGeneralSerializedSlice::MergeSlice(TGeneralSerializedSlice&& slice) { Y_ABORT_UNLESS(Data.size() == slice.Data.size()); RecordsCount += slice.GetRecordsCount(); diff --git a/ydb/core/tx/columnshard/splitter/batch_slice.h b/ydb/core/tx/columnshard/splitter/batch_slice.h index 00a27a3d4739..3b2dd6f1bf8e 100644 --- a/ydb/core/tx/columnshard/splitter/batch_slice.h +++ b/ydb/core/tx/columnshard/splitter/batch_slice.h @@ -1,10 +1,10 @@ #pragma once #include "chunks.h" -#include "stats.h" -#include "scheme_info.h" #include "column_info.h" #include "blob_info.h" -#include "similar_packer.h" +#include +#include +#include #include #include #include @@ -14,16 +14,17 @@ namespace NKikimr::NOlap { -class TDefaultSchemaDetails: public ISchemaDetailInfo { +class TDefaultSchemaDetails: public NArrow::NSplitter::ISchemaDetailInfo { private: ISnapshotSchema::TPtr Schema; - std::shared_ptr Stats; + std::shared_ptr Stats; + protected: virtual TColumnSaver DoGetColumnSaver(const ui32 columnId) const override { return Schema->GetColumnSaver(columnId); } public: - TDefaultSchemaDetails(ISnapshotSchema::TPtr schema, const std::shared_ptr& stats) + TDefaultSchemaDetails(ISnapshotSchema::TPtr schema, const std::shared_ptr& stats) : Schema(schema) , Stats(stats) { @@ -39,14 +40,15 @@ class TDefaultSchemaDetails: public ISchemaDetailInfo { return Schema->GetIndexInfo().IsSortedColumn(columnId); } - virtual std::optional GetColumnSerializationStats(const ui32 columnId) const override { + virtual std::optional GetColumnSerializationStats(const ui32 columnId) const override { auto stats = Stats->GetColumnInfo(columnId); if (stats && stats->GetRecordsCount() != 0) { return stats; } return std::nullopt; } - virtual std::optional GetBatchSerializationStats(const std::shared_ptr& rb) const override { + virtual std::optional GetBatchSerializationStats( + const std::shared_ptr& rb) const override { return Stats->GetStatsForRecordBatch(rb); } virtual ui32 GetColumnId(const std::string& fieldName) const override { @@ -61,7 +63,7 @@ class TGeneralSerializedSlice { protected: std::vector Data; ui64 Size = 0; - ISchemaDetailInfo::TPtr Schema; + NArrow::NSplitter::ISchemaDetailInfo::TPtr Schema; std::shared_ptr Counters; TGeneralSerializedSlice() = default; @@ -116,15 +118,17 @@ class TGeneralSerializedSlice { return blobs; } - explicit TGeneralSerializedSlice(TVectorView&& objects) { + explicit TGeneralSerializedSlice(NArrow::NSplitter::TVectorView&& objects) { Y_ABORT_UNLESS(objects.size()); std::swap(*this, objects.front()); for (ui32 i = 1; i < objects.size(); ++i) { MergeSlice(std::move(objects[i])); } } - TGeneralSerializedSlice(const THashMap>>& data, ISchemaDetailInfo::TPtr schema, std::shared_ptr counters); - TGeneralSerializedSlice(const ui32 recordsCount, ISchemaDetailInfo::TPtr schema, std::shared_ptr counters); + TGeneralSerializedSlice(const THashMap>>& data, + NArrow::NSplitter::ISchemaDetailInfo::TPtr schema, std::shared_ptr counters); + TGeneralSerializedSlice( + const ui32 recordsCount, NArrow::NSplitter::ISchemaDetailInfo::TPtr schema, std::shared_ptr counters); void MergeSlice(TGeneralSerializedSlice&& slice); @@ -135,28 +139,4 @@ class TGeneralSerializedSlice { } }; -class TBatchSerializedSlice: public TGeneralSerializedSlice { -private: - using TBase = TGeneralSerializedSlice; - YDB_READONLY_DEF(std::shared_ptr, Batch); -public: - TBatchSerializedSlice(const std::shared_ptr& batch, ISchemaDetailInfo::TPtr schema, std::shared_ptr counters, const NSplitter::TSplitSettings& settings); - - explicit TBatchSerializedSlice(TVectorView&& objects) { - Y_ABORT_UNLESS(objects.size()); - std::swap(*this, objects.front()); - for (ui32 i = 1; i < objects.size(); ++i) { - MergeSlice(std::move(objects[i])); - } - } - void MergeSlice(TBatchSerializedSlice&& slice) { - Batch = NArrow::CombineBatches({Batch, slice.Batch}); - TBase::MergeSlice(std::move(slice)); - } - - static std::vector BuildSimpleSlices(const std::shared_ptr& batch, const NSplitter::TSplitSettings& settings, - const std::shared_ptr& counters, const ISchemaDetailInfo::TPtr& schemaInfo); - -}; - } diff --git a/ydb/core/tx/columnshard/splitter/chunks.h b/ydb/core/tx/columnshard/splitter/chunks.h index 280f47d8c238..577f169865aa 100644 --- a/ydb/core/tx/columnshard/splitter/chunks.h +++ b/ydb/core/tx/columnshard/splitter/chunks.h @@ -27,8 +27,10 @@ class IPortionColumnChunk : public IPortionDataChunk { virtual void DoAddIntoPortionBeforeBlob(const TBlobRangeLink16& bRange, TPortionInfoConstructor& portionInfo) const override; - virtual std::vector> DoInternalSplitImpl(const TColumnSaver& saver, const std::shared_ptr& counters, const std::vector& splitSizes) const = 0; - virtual std::vector> DoInternalSplit(const TColumnSaver& saver, const std::shared_ptr& counters, const std::vector& splitSizes) const override; + virtual std::vector> DoInternalSplitImpl(const TColumnSaver& saver, + const std::shared_ptr& counters, const std::vector& splitSizes) const = 0; + virtual std::vector> DoInternalSplit(const TColumnSaver& saver, + const std::shared_ptr& counters, const std::vector& splitSizes) const override; virtual bool DoIsSplittable() const override { return GetRecordsCount() > 1; } @@ -53,7 +55,8 @@ class TChunkedColumnReader { std::vector> Chunks; std::shared_ptr Loader; - std::shared_ptr CurrentChunk; + std::shared_ptr CurrentChunk; + std::optional CurrentChunkArray; ui32 CurrentChunkIndex = 0; ui32 CurrentRecordIndex = 0; public: @@ -68,16 +71,29 @@ class TChunkedColumnReader { CurrentChunkIndex = 0; CurrentRecordIndex = 0; if (Chunks.size()) { - CurrentChunk = Loader->ApplyVerifiedColumn(Chunks.front()->GetData()); + CurrentChunk = Loader->ApplyVerified(Chunks.front()->GetData(), Chunks.front()->GetRecordsCountVerified()); + CurrentChunkArray.reset(); } } - const std::shared_ptr& GetCurrentChunk() const { + const std::shared_ptr& GetCurrentChunk() { + if (!CurrentChunkArray || CurrentChunkArray->GetFinishPosition() <= CurrentRecordIndex) { + CurrentChunkArray = CurrentChunk->GetChunk(CurrentChunkArray, CurrentRecordIndex); + } + AFL_VERIFY(CurrentChunkArray); + return CurrentChunkArray->GetArray(); + } + + const std::shared_ptr& GetCurrentAccessor() const { + AFL_VERIFY(CurrentChunk); return CurrentChunk; } - ui32 GetCurrentRecordIndex() const { - return CurrentRecordIndex; + ui32 GetCurrentRecordIndex() { + if (!CurrentChunkArray || CurrentChunkArray->GetFinishPosition() <= CurrentRecordIndex) { + CurrentChunkArray = CurrentChunk->GetChunk(CurrentChunkArray, CurrentRecordIndex); + } + return CurrentRecordIndex - CurrentChunkArray->GetStartPosition(); } bool IsCorrect() const { @@ -86,19 +102,21 @@ class TChunkedColumnReader { bool ReadNextChunk() { while (++CurrentChunkIndex < Chunks.size()) { - CurrentChunk = Loader->ApplyVerifiedColumn(Chunks[CurrentChunkIndex]->GetData()); + CurrentChunk = Loader->ApplyVerified(Chunks[CurrentChunkIndex]->GetData(), Chunks[CurrentChunkIndex]->GetRecordsCountVerified()); + CurrentChunkArray.reset(); CurrentRecordIndex = 0; - if (CurrentRecordIndex < CurrentChunk->length()) { + if (CurrentRecordIndex < CurrentChunk->GetRecordsCount()) { return true; } } + CurrentChunkArray.reset(); CurrentChunk = nullptr; return false; } bool ReadNext() { AFL_VERIFY(!!CurrentChunk); - if (++CurrentRecordIndex < CurrentChunk->length()) { + if (++CurrentRecordIndex < CurrentChunk->GetRecordsCount()) { return true; } return ReadNextChunk(); @@ -156,6 +174,14 @@ class TChunkedBatchReader { std::vector::const_iterator end() const { return Columns.end(); } + + std::vector::iterator begin() { + return Columns.begin(); + } + + std::vector::iterator end() { + return Columns.end(); + } }; } diff --git a/ydb/core/tx/columnshard/splitter/scheme_info.cpp b/ydb/core/tx/columnshard/splitter/scheme_info.cpp deleted file mode 100644 index fe4a65604e11..000000000000 --- a/ydb/core/tx/columnshard/splitter/scheme_info.cpp +++ /dev/null @@ -1,13 +0,0 @@ -#include "scheme_info.h" - -namespace NKikimr::NOlap { - -NKikimr::NOlap::TColumnSaver ISchemaDetailInfo::GetColumnSaver(const ui32 columnId) const { - auto saver = DoGetColumnSaver(columnId); - if (OverrideSerializer) { - saver.ResetSerializer(*OverrideSerializer); - } - return saver; -} - -} diff --git a/ydb/core/tx/columnshard/splitter/similar_packer.cpp b/ydb/core/tx/columnshard/splitter/similar_packer.cpp deleted file mode 100644 index 9d22b3a6b255..000000000000 --- a/ydb/core/tx/columnshard/splitter/similar_packer.cpp +++ /dev/null @@ -1,5 +0,0 @@ -#include "similar_packer.h" - -namespace NKikimr::NOlap { - -} diff --git a/ydb/core/tx/columnshard/splitter/ut/batch_slice.cpp b/ydb/core/tx/columnshard/splitter/ut/batch_slice.cpp new file mode 100644 index 000000000000..0d98c8324737 --- /dev/null +++ b/ydb/core/tx/columnshard/splitter/ut/batch_slice.cpp @@ -0,0 +1,65 @@ +#include "batch_slice.h" + +#include +#include +#include + +#include + +namespace NKikimr::NOlap { + +TBatchSerializedSlice::TBatchSerializedSlice(const std::shared_ptr& batch, NArrow::NSplitter::ISchemaDetailInfo::TPtr schema, + std::shared_ptr counters, const NSplitter::TSplitSettings& settings) + : TBase(TValidator::CheckNotNull(batch)->num_rows(), schema, counters) + , Batch(batch) { + Y_ABORT_UNLESS(batch); + Data.reserve(batch->num_columns()); + for (auto&& i : batch->schema()->fields()) { + TSplittedEntity c(schema->GetColumnId(i->name())); + Data.emplace_back(std::move(c)); + } + + ui32 idx = 0; + for (auto&& i : batch->columns()) { + auto& c = Data[idx]; + auto columnSaver = schema->GetColumnSaver(c.GetEntityId()); + auto stats = schema->GetColumnSerializationStats(c.GetEntityId()); + NKikimr::NArrow::NSplitter::TSimpleSplitter splitter(columnSaver); + splitter.SetStats(stats); + std::vector> chunks; + for (auto&& i : splitter.Split(i, Schema->GetField(c.GetEntityId()), settings.GetMaxBlobSize())) { + NOlap::TSimpleColumnInfo columnInfo(c.GetEntityId(), Schema->GetField(c.GetEntityId()), + Schema->GetColumnSaver(c.GetEntityId()).GetSerializer(), true, false, nullptr); + chunks.emplace_back(std::make_shared(i.GetSerializedChunk(), + std::make_shared(i.GetSlicedBatch()->column(0)), TChunkAddress(c.GetEntityId(), 0), + columnInfo)); + } + c.SetChunks(chunks); + Size += c.GetSize(); + ++idx; + } +} + +std::vector TBatchSerializedSlice::BuildSimpleSlices(const std::shared_ptr& batch, + const NSplitter::TSplitSettings& settings, const std::shared_ptr& counters, + const NArrow::NSplitter::ISchemaDetailInfo::TPtr& schemaInfo) { + std::vector slices; + auto stats = schemaInfo->GetBatchSerializationStats(batch); + ui32 recordsCount = settings.GetMinRecordsCount(); + if (stats) { + const ui32 recordsCountForMinSize = + stats->PredictOptimalPackRecordsCount(batch->num_rows(), settings.GetMinBlobSize()).value_or(recordsCount); + const ui32 recordsCountForMaxPortionSize = + stats->PredictOptimalPackRecordsCount(batch->num_rows(), settings.GetMaxPortionSize()).value_or(recordsCount); + recordsCount = std::min(recordsCountForMaxPortionSize, std::max(recordsCount, recordsCountForMinSize)); + } + auto linearSplitInfo = NKikimr::NArrow::NSplitter::TSimpleSplitter::GetOptimalLinearSplitting(batch->num_rows(), recordsCount); + for (auto it = linearSplitInfo.StartIterator(); it.IsValid(); it.Next()) { + std::shared_ptr current = batch->Slice(it.GetPosition(), it.GetCurrentPackSize()); + TBatchSerializedSlice slice(current, schemaInfo, counters, settings); + slices.emplace_back(std::move(slice)); + } + return slices; +} + +} // namespace NKikimr::NOlap diff --git a/ydb/core/tx/columnshard/splitter/ut/batch_slice.h b/ydb/core/tx/columnshard/splitter/ut/batch_slice.h new file mode 100644 index 000000000000..c085eb1568ce --- /dev/null +++ b/ydb/core/tx/columnshard/splitter/ut/batch_slice.h @@ -0,0 +1,31 @@ +#pragma once +#include + +namespace NKikimr::NOlap { + +class TBatchSerializedSlice: public TGeneralSerializedSlice { +private: + using TBase = TGeneralSerializedSlice; + YDB_READONLY_DEF(std::shared_ptr, Batch); + +public: + TBatchSerializedSlice(const std::shared_ptr& batch, NArrow::NSplitter::ISchemaDetailInfo::TPtr schema, + std::shared_ptr counters, const NSplitter::TSplitSettings& settings); + + explicit TBatchSerializedSlice(NArrow::NSplitter::TVectorView&& objects) { + Y_ABORT_UNLESS(objects.size()); + std::swap(*this, objects.front()); + for (ui32 i = 1; i < objects.size(); ++i) { + MergeSlice(std::move(objects[i])); + } + } + void MergeSlice(TBatchSerializedSlice&& slice) { + Batch = NArrow::CombineBatches({ Batch, slice.Batch }); + TBase::MergeSlice(std::move(slice)); + } + + static std::vector BuildSimpleSlices(const std::shared_ptr& batch, + const NSplitter::TSplitSettings& settings, const std::shared_ptr& counters, + const NArrow::NSplitter::ISchemaDetailInfo::TPtr& schemaInfo); +}; +} diff --git a/ydb/core/tx/columnshard/splitter/ut/ut_splitter.cpp b/ydb/core/tx/columnshard/splitter/ut/ut_splitter.cpp index f2f942dbbab3..06c6f5020f29 100644 --- a/ydb/core/tx/columnshard/splitter/ut/ut_splitter.cpp +++ b/ydb/core/tx/columnshard/splitter/ut/ut_splitter.cpp @@ -1,29 +1,32 @@ -#include -#include -#include -#include -#include -#include +#include "batch_slice.h" -#include -#include +#include +#include +#include #include #include - -#include +#include +#include +#include +#include +#include +#include +#include #include +#include Y_UNIT_TEST_SUITE(Splitter) { - using namespace NKikimr::NArrow; - class TTestSnapshotSchema: public NKikimr::NOlap::ISchemaDetailInfo { + class TTestSnapshotSchema: public NKikimr::NArrow::NSplitter::ISchemaDetailInfo { private: mutable std::map Decoder; + protected: - virtual NKikimr::NOlap::TColumnSaver DoGetColumnSaver(const ui32 columnId) const override { - return NKikimr::NOlap::TColumnSaver(nullptr, std::make_shared(arrow::ipc::IpcOptions::Defaults())); + virtual NKikimr::NArrow::NAccessor::TColumnSaver DoGetColumnSaver(const ui32 columnId) const override { + return NKikimr::NArrow::NAccessor::TColumnSaver( + nullptr, std::make_shared(arrow::ipc::IpcOptions::Defaults())); } public: @@ -34,17 +37,18 @@ Y_UNIT_TEST_SUITE(Splitter) { return false; } - virtual std::optional GetColumnSerializationStats(const ui32 /*columnId*/) const override { + virtual std::optional GetColumnSerializationStats( + const ui32 /*columnId*/) const override { return {}; } - virtual std::optional GetBatchSerializationStats(const std::shared_ptr& /*rb*/) const override { + virtual std::optional GetBatchSerializationStats( + const std::shared_ptr& /*rb*/) const override { return {}; } - NKikimr::NOlap::TColumnLoader GetColumnLoader(const ui32 columnId) const { - arrow::FieldVector v = {std::make_shared(GetColumnName(columnId), std::make_shared())}; - auto schema = std::make_shared(v); - return NKikimr::NOlap::TColumnLoader(nullptr, NSerialization::TSerializerContainer::GetDefaultSerializer(), schema, nullptr, columnId); + NKikimr::NArrow::NAccessor::TColumnLoader GetColumnLoader(const ui32 columnId) const { + return NKikimr::NArrow::NAccessor::TColumnLoader(nullptr, NSerialization::TSerializerContainer::GetDefaultSerializer(), + NKikimr::NArrow::NAccessor::TConstructorContainer::GetDefaultConstructor(), GetField(columnId), nullptr, columnId); } virtual std::shared_ptr GetField(const ui32 columnId) const override { @@ -80,9 +84,8 @@ Y_UNIT_TEST_SUITE(Splitter) { YDB_ACCESSOR(std::optional, ExpectedInternalSplitsCount, 0); public: - void Execute(std::shared_ptr batch, - const NKikimr::NOlap::NSplitter::TSplitSettings& settings = NKikimr::NOlap::NSplitter::TSplitSettings() - ) { + void Execute(std::shared_ptr batch, + const NKikimr::NOlap::NSplitter::TSplitSettings& settings = NKikimr::NOlap::NSplitter::TSplitSettings()) { using namespace NKikimr::NOlap; NKikimr::NColumnShard::TIndexationCounters counters("test"); std::vector generalSlices; @@ -93,9 +96,9 @@ Y_UNIT_TEST_SUITE(Splitter) { } } - TSimilarPacker packer(settings.GetExpectedPortionSize()); + NKikimr::NArrow::NSplitter::TSimilarPacker packer(settings.GetExpectedPortionSize()); auto packs = packer.Split(generalSlices); - const NSplitter::TEntityGroups groups(settings, "default"); + const NKikimr::NOlap::NSplitter::TEntityGroups groups(settings, "default"); const ui32 portionsCount = packs.size(); ui32 blobsCount = 0; ui32 chunksCount = 0; @@ -129,9 +132,11 @@ Y_UNIT_TEST_SUITE(Splitter) { } portionSize += bSize; AFL_VERIFY(bSize < (ui64)settings.GetMaxBlobSize()); - AFL_VERIFY(bSize * 1.01 > (ui64)settings.GetMinBlobSize() || (packs.size() == 1 && blobsLocal.size() == 1))("blob_size", bSize); + AFL_VERIFY(bSize * 1.01 > (ui64)settings.GetMinBlobSize() || (packs.size() == 1 && blobsLocal.size() == 1))( + "blob_size", bSize); } - AFL_VERIFY(portionSize >= settings.GetExpectedPortionSize() || packs.size() == 1)("size", portionSize)("limit", settings.GetMaxPortionSize()); + AFL_VERIFY(portionSize >= settings.GetExpectedPortionSize() || packs.size() == 1)("size", portionSize)( + "limit", settings.GetMaxPortionSize()); THashMap> entitiesByRecordsCount; ui32 pagesRestore = 0; @@ -141,7 +146,7 @@ Y_UNIT_TEST_SUITE(Splitter) { ui32 count = 0; for (auto&& c : e.second) { auto slice = arr->Slice(count + portionShift, c->GetRecordsCountVerified()); - auto readBatch = *Schema->GetColumnLoader(e.first).Apply(c->GetData()); + auto readBatch = Schema->GetColumnLoader(e.first).ApplyRawVerified(c->GetData()); AFL_VERIFY(slice->length() == readBatch->num_rows()); Y_ABORT_UNLESS(readBatch->column(0)->RangeEquals(*slice, 0, readBatch->num_rows(), 0, arrow::EqualOptions::Defaults())); count += c->GetRecordsCountVerified(); @@ -161,91 +166,107 @@ Y_UNIT_TEST_SUITE(Splitter) { } AFL_VERIFY(portionShift = batch->num_rows()); AFL_VERIFY(pagesSum == generalSlices.size())("sum", pagesSum)("general_slices", generalSlices.size()); - AFL_VERIFY(internalSplitsCount == ExpectedInternalSplitsCount.value_or(internalSplitsCount))("expected", *ExpectedInternalSplitsCount)("real", internalSplitsCount); + AFL_VERIFY(internalSplitsCount == ExpectedInternalSplitsCount.value_or(internalSplitsCount))( + "expected", *ExpectedInternalSplitsCount)("real", internalSplitsCount); AFL_VERIFY(blobsCount == ExpectBlobsCount.value_or(blobsCount))("blobs_count", blobsCount)("expected", *ExpectBlobsCount); AFL_VERIFY(pagesSum == ExpectSlicesCount.value_or(pagesSum))("sum", pagesSum)("expected", *ExpectSlicesCount); - AFL_VERIFY(portionsCount == ExpectPortionsCount.value_or(portionsCount))("portions_count", portionsCount)("expected", *ExpectPortionsCount); + AFL_VERIFY(portionsCount == ExpectPortionsCount.value_or(portionsCount))("portions_count", portionsCount)( + "expected", *ExpectPortionsCount); AFL_VERIFY(chunksCount == ExpectChunksCount.value_or(chunksCount))("chunks_count", chunksCount)("expected", *ExpectChunksCount); - } }; Y_UNIT_TEST(Simple) { - NConstruction::IArrayBuilder::TPtr column = std::make_shared>( - "field", NKikimr::NArrow::NConstruction::TStringPoolFiller(8, 512)); - std::shared_ptr batch = NKikimr::NArrow::NConstruction::TRecordBatchConstructor({column}).BuildBatch(80048); + NConstruction::IArrayBuilder::TPtr column = + std::make_shared>( + "field", NKikimr::NArrow::NConstruction::TStringPoolFiller(8, 512)); + std::shared_ptr batch = NKikimr::NArrow::NConstruction::TRecordBatchConstructor({ column }).BuildBatch(80048); NKikimr::NColumnShard::TIndexationCounters counters("test"); TSplitTester().SetExpectBlobsCount(8).SetExpectSlicesCount(8).Execute(batch); } Y_UNIT_TEST(Small) { - NConstruction::IArrayBuilder::TPtr column = std::make_shared>( - "field", NKikimr::NArrow::NConstruction::TStringPoolFiller(8, 24)); - std::shared_ptr batch = NKikimr::NArrow::NConstruction::TRecordBatchConstructor({column}).BuildBatch(80048); + NConstruction::IArrayBuilder::TPtr column = + std::make_shared>( + "field", NKikimr::NArrow::NConstruction::TStringPoolFiller(8, 24)); + std::shared_ptr batch = NKikimr::NArrow::NConstruction::TRecordBatchConstructor({ column }).BuildBatch(80048); NKikimr::NColumnShard::TIndexationCounters counters("test"); TSplitTester().SetExpectBlobsCount(1).SetExpectSlicesCount(8).Execute(batch); } Y_UNIT_TEST(Minimal) { - NConstruction::IArrayBuilder::TPtr column = std::make_shared>( - "field", NKikimr::NArrow::NConstruction::TStringPoolFiller(8, 512)); - std::shared_ptr batch = NKikimr::NArrow::NConstruction::TRecordBatchConstructor({column}).BuildBatch(4048); + NConstruction::IArrayBuilder::TPtr column = + std::make_shared>( + "field", NKikimr::NArrow::NConstruction::TStringPoolFiller(8, 512)); + std::shared_ptr batch = NKikimr::NArrow::NConstruction::TRecordBatchConstructor({ column }).BuildBatch(4048); NKikimr::NColumnShard::TIndexationCounters counters("test"); TSplitTester().SetExpectBlobsCount(1).SetExpectSlicesCount(1).Execute(batch); } Y_UNIT_TEST(Trivial) { - NConstruction::IArrayBuilder::TPtr column = std::make_shared>( - "field", NKikimr::NArrow::NConstruction::TStringPoolFiller(8, 512)); - std::shared_ptr batch = NKikimr::NArrow::NConstruction::TRecordBatchConstructor({column}).BuildBatch(10048); + NConstruction::IArrayBuilder::TPtr column = + std::make_shared>( + "field", NKikimr::NArrow::NConstruction::TStringPoolFiller(8, 512)); + std::shared_ptr batch = NKikimr::NArrow::NConstruction::TRecordBatchConstructor({ column }).BuildBatch(10048); TSplitTester().SetExpectBlobsCount(1).SetExpectSlicesCount(1).Execute(batch); } Y_UNIT_TEST(BigAndSmall) { - NConstruction::IArrayBuilder::TPtr columnBig = std::make_shared>( - "field1", NKikimr::NArrow::NConstruction::TStringPoolFiller(8, 512)); - NConstruction::IArrayBuilder::TPtr columnSmall = std::make_shared>( - "field2", NKikimr::NArrow::NConstruction::TStringPoolFiller(8, 1)); - std::shared_ptr batch = NKikimr::NArrow::NConstruction::TRecordBatchConstructor({columnBig, columnSmall}).BuildBatch(80048); + NConstruction::IArrayBuilder::TPtr columnBig = + std::make_shared>( + "field1", NKikimr::NArrow::NConstruction::TStringPoolFiller(8, 512)); + NConstruction::IArrayBuilder::TPtr columnSmall = + std::make_shared>( + "field2", NKikimr::NArrow::NConstruction::TStringPoolFiller(8, 1)); + std::shared_ptr batch = + NKikimr::NArrow::NConstruction::TRecordBatchConstructor({ columnBig, columnSmall }).BuildBatch(80048); NKikimr::NColumnShard::TIndexationCounters counters("test"); TSplitTester().SetExpectBlobsCount(8).SetExpectSlicesCount(8).Execute(batch); } Y_UNIT_TEST(CritSmallPortions) { - NConstruction::IArrayBuilder::TPtr columnBig = std::make_shared>( - "field1", NKikimr::NArrow::NConstruction::TStringPoolFiller(8, 7120)); - NConstruction::IArrayBuilder::TPtr columnSmall = std::make_shared>( - "field2", NKikimr::NArrow::NConstruction::TStringPoolFiller(8, 128)); - std::shared_ptr batch = NKikimr::NArrow::NConstruction::TRecordBatchConstructor({columnBig, columnSmall}).BuildBatch(80048); + NConstruction::IArrayBuilder::TPtr columnBig = + std::make_shared>( + "field1", NKikimr::NArrow::NConstruction::TStringPoolFiller(8, 7120)); + NConstruction::IArrayBuilder::TPtr columnSmall = + std::make_shared>( + "field2", NKikimr::NArrow::NConstruction::TStringPoolFiller(8, 128)); + std::shared_ptr batch = + NKikimr::NArrow::NConstruction::TRecordBatchConstructor({ columnBig, columnSmall }).BuildBatch(80048); NKikimr::NColumnShard::TIndexationCounters counters("test"); - TSplitTester().SetExpectBlobsCount(80).SetExpectSlicesCount(80).SetExpectedInternalSplitsCount(0).SetExpectPortionsCount(40) - .Execute(batch, NKikimr::NOlap::NSplitter::TSplitSettings().SetMinRecordsCount(1000).SetMaxPortionSize(8000000)); + TSplitTester().SetExpectBlobsCount(80).SetExpectSlicesCount(80).SetExpectedInternalSplitsCount(0).SetExpectPortionsCount(40).Execute( + batch, NKikimr::NOlap::NSplitter::TSplitSettings().SetMinRecordsCount(1000).SetMaxPortionSize(8000000)); } Y_UNIT_TEST(Crit) { - NConstruction::IArrayBuilder::TPtr columnBig = std::make_shared>( - "field1", NKikimr::NArrow::NConstruction::TStringPoolFiller(8, 7120)); - NConstruction::IArrayBuilder::TPtr columnSmall = std::make_shared>( - "field2", NKikimr::NArrow::NConstruction::TStringPoolFiller(8, 128)); - std::shared_ptr batch = NKikimr::NArrow::NConstruction::TRecordBatchConstructor({columnBig, columnSmall}).BuildBatch(80048); + NConstruction::IArrayBuilder::TPtr columnBig = + std::make_shared>( + "field1", NKikimr::NArrow::NConstruction::TStringPoolFiller(8, 7120)); + NConstruction::IArrayBuilder::TPtr columnSmall = + std::make_shared>( + "field2", NKikimr::NArrow::NConstruction::TStringPoolFiller(8, 128)); + std::shared_ptr batch = + NKikimr::NArrow::NConstruction::TRecordBatchConstructor({ columnBig, columnSmall }).BuildBatch(80048); NKikimr::NColumnShard::TIndexationCounters counters("test"); - TSplitTester().SetExpectBlobsCount(80).SetExpectSlicesCount(8).SetExpectedInternalSplitsCount(8).SetExpectPortionsCount(8).Execute(batch); + TSplitTester().SetExpectBlobsCount(80).SetExpectSlicesCount(8).SetExpectedInternalSplitsCount(8).SetExpectPortionsCount(8).Execute( + batch); } Y_UNIT_TEST(CritSimple) { - NConstruction::IArrayBuilder::TPtr columnBig = std::make_shared>( - "field1", NKikimr::NArrow::NConstruction::TStringPoolFiller(8, 7120)); - std::shared_ptr batch = NKikimr::NArrow::NConstruction::TRecordBatchConstructor({columnBig}).BuildBatch(80048); + NConstruction::IArrayBuilder::TPtr columnBig = + std::make_shared>( + "field1", NKikimr::NArrow::NConstruction::TStringPoolFiller(8, 7120)); + std::shared_ptr batch = NKikimr::NArrow::NConstruction::TRecordBatchConstructor({ columnBig }).BuildBatch(80048); NKikimr::NColumnShard::TIndexationCounters counters("test"); - TSplitTester().SetExpectBlobsCount(72).SetExpectSlicesCount(8).SetExpectedInternalSplitsCount(0).SetExpectPortionsCount(8).Execute(batch); + TSplitTester().SetExpectBlobsCount(72).SetExpectSlicesCount(8).SetExpectedInternalSplitsCount(0).SetExpectPortionsCount(8).Execute( + batch); } - }; diff --git a/ydb/core/tx/columnshard/splitter/ut/ya.make b/ydb/core/tx/columnshard/splitter/ut/ya.make index 24d266bffa8e..82d16868f00f 100644 --- a/ydb/core/tx/columnshard/splitter/ut/ya.make +++ b/ydb/core/tx/columnshard/splitter/ut/ya.make @@ -40,6 +40,7 @@ CFLAGS( SRCS( ut_splitter.cpp + batch_slice.cpp ) END() diff --git a/ydb/core/tx/columnshard/splitter/ya.make b/ydb/core/tx/columnshard/splitter/ya.make index 5f6c60cdf1ff..380d51bca325 100644 --- a/ydb/core/tx/columnshard/splitter/ya.make +++ b/ydb/core/tx/columnshard/splitter/ya.make @@ -3,12 +3,8 @@ LIBRARY() SRCS( batch_slice.cpp chunks.cpp - simple.cpp - similar_packer.cpp - stats.cpp column_info.cpp settings.cpp - scheme_info.cpp blob_info.cpp chunk_meta.cpp ) @@ -17,6 +13,7 @@ PEERDIR( contrib/libs/apache/arrow ydb/core/tx/columnshard/splitter/abstract ydb/core/tx/columnshard/engines/scheme + ydb/core/formats/arrow/splitter ) END() diff --git a/ydb/core/tx/columnshard/test_helper/columnshard_ut_common.h b/ydb/core/tx/columnshard/test_helper/columnshard_ut_common.h index b24b05e4ffdd..0ec1b7432ef2 100644 --- a/ydb/core/tx/columnshard/test_helper/columnshard_ut_common.h +++ b/ydb/core/tx/columnshard/test_helper/columnshard_ut_common.h @@ -167,7 +167,7 @@ struct TTestSchema { // PK firstKeyItem, TTestColumn("resource_type", TTypeInfo(NTypeIds::Utf8) ), - TTestColumn("resource_id", TTypeInfo(NTypeIds::Utf8) ), + TTestColumn("resource_id", TTypeInfo(NTypeIds::Utf8)).SetAccessorClassName("SPARSED"), TTestColumn("uid", TTypeInfo(NTypeIds::Utf8) ).SetStorageId("__MEMORY"), TTestColumn("level", TTypeInfo(NTypeIds::Int32) ), TTestColumn("message", TTypeInfo(NTypeIds::Utf8) ).SetStorageId("__MEMORY"), @@ -183,7 +183,7 @@ struct TTestSchema { std::vector schema = { // PK TTestColumn("timestamp", TTypeInfo(NTypeIds::Timestamp) ), - TTestColumn("resource_type", TTypeInfo(NTypeIds::Utf8) ), + TTestColumn("resource_type", TTypeInfo(NTypeIds::Utf8)).SetAccessorClassName("SPARSED"), TTestColumn("resource_id", TTypeInfo(NTypeIds::Utf8) ), TTestColumn("uid", TTypeInfo(NTypeIds::Utf8) ).SetStorageId("__MEMORY"), // @@ -192,7 +192,7 @@ struct TTestSchema { TTestColumn("json_payload", TTypeInfo(NTypeIds::JsonDocument) ), TTestColumn("ingested_at", TTypeInfo(NTypeIds::Timestamp) ), TTestColumn("saved_at", TTypeInfo(NTypeIds::Timestamp) ), - TTestColumn("request_id", TTypeInfo(NTypeIds::Yson) ) + TTestColumn("request_id", TTypeInfo(NTypeIds::Yson)).SetAccessorClassName("SPARSED") }; return schema; }; @@ -201,7 +201,7 @@ struct TTestSchema { std::vector schema = { TTestColumn("timestamp", TTypeInfo(NTypeIds::Timestamp) ), TTestColumn("resource_type", TTypeInfo(NTypeIds::Utf8) ).SetStorageId("__MEMORY"), - TTestColumn("resource_id", TTypeInfo(NTypeIds::Utf8) ), + TTestColumn("resource_id", TTypeInfo(NTypeIds::Utf8)).SetAccessorClassName("SPARSED"), TTestColumn("uid", TTypeInfo(NTypeIds::Utf8) ).SetStorageId("__MEMORY") }; return schema; diff --git a/ydb/core/tx/columnshard/test_helper/helper.cpp b/ydb/core/tx/columnshard/test_helper/helper.cpp index a7aca5f114ca..7c36bd7d5571 100644 --- a/ydb/core/tx/columnshard/test_helper/helper.cpp +++ b/ydb/core/tx/columnshard/test_helper/helper.cpp @@ -1,6 +1,7 @@ #include "helper.h" #include +#include #include #include #include @@ -25,6 +26,9 @@ NKikimrSchemeOp::TOlapColumnDescription TTestColumn::CreateColumn(const ui32 id) col.SetStorageId(StorageId); } auto columnType = NScheme::ProtoColumnTypeFromTypeInfoMod(Type, ""); + if (AccessorClassName) { + col.MutableDataAccessorConstructor()->SetClassName(AccessorClassName); + } col.SetTypeId(columnType.TypeId); if (columnType.TypeInfo) { *col.MutableTypeInfo() = *columnType.TypeInfo; diff --git a/ydb/core/tx/columnshard/test_helper/helper.h b/ydb/core/tx/columnshard/test_helper/helper.h index 956d962f0bd1..492b2f492297 100644 --- a/ydb/core/tx/columnshard/test_helper/helper.h +++ b/ydb/core/tx/columnshard/test_helper/helper.h @@ -51,6 +51,7 @@ class TTestColumn { YDB_ACCESSOR_DEF(TString, Name); YDB_ACCESSOR_DEF(NScheme::TTypeInfo, Type); YDB_ACCESSOR_DEF(TString, StorageId); + YDB_ACCESSOR_DEF(TString, AccessorClassName); public: explicit TTestColumn(const TString& name, const NScheme::TTypeInfo& type) diff --git a/ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp b/ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp index 700ed7157f15..b5511676656f 100644 --- a/ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp +++ b/ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp @@ -878,8 +878,8 @@ void TestWriteRead(bool reboots, const TestTableDescription& table = {}, TString const ui64 committedBytes = reader.GetReadStat("committed_bytes"); Cerr << codec << "/" << compactedBytes << "/" << insertedBytes << "/" << committedBytes << Endl; if (insertedBytes) { - UNIT_ASSERT_GE(insertedBytes / 100000, 40); - UNIT_ASSERT_LE(insertedBytes / 100000, 50); + UNIT_ASSERT_GE(insertedBytes / 100000, 50); + UNIT_ASSERT_LE(insertedBytes / 100000, 60); } if (committedBytes) { UNIT_ASSERT_LE(committedBytes / 100000, 1); diff --git a/ydb/core/tx/schemeshard/olap/columns/update.cpp b/ydb/core/tx/schemeshard/olap/columns/update.cpp index 83e300176007..c66da237c712 100644 --- a/ydb/core/tx/schemeshard/olap/columns/update.cpp +++ b/ydb/core/tx/schemeshard/olap/columns/update.cpp @@ -119,6 +119,11 @@ namespace NKikimr::NSchemeShard { serializer.DeserializeFromProto(columnSchema.GetCompression()).Validate(); Serializer = serializer; } + if (columnSchema.HasDataAccessorConstructor()) { + NArrow::NAccessor::TConstructorContainer container; + AFL_VERIFY(container.DeserializeFromProto(columnSchema.GetDataAccessorConstructor())); + AccessorConstructor = container; + } if (columnSchema.HasDictionaryEncoding()) { auto settings = NArrow::NDictionary::TEncodingSettings::BuildFromProto(columnSchema.GetDictionaryEncoding()); Y_ABORT_UNLESS(settings.IsSuccess()); @@ -140,6 +145,9 @@ namespace NKikimr::NSchemeShard { if (Serializer) { Serializer->SerializeToProto(*columnSchema.MutableSerializer()); } + if (AccessorConstructor) { + *columnSchema.MutableDataAccessorConstructor() = AccessorConstructor.SerializeToProto(); + } if (DictionaryEncoding) { *columnSchema.MutableDictionaryEncoding() = DictionaryEncoding->SerializeToProto(); } @@ -160,6 +168,14 @@ namespace NKikimr::NSchemeShard { return false; } } + if (!!diffColumn.GetAccessorConstructor()) { + auto conclusion = diffColumn.GetAccessorConstructor()->BuildConstructor(); + if (conclusion.IsFail()) { + errors.AddError(conclusion.GetErrorMessage()); + return false; + } + AccessorConstructor = conclusion.DetachResult(); + } if (diffColumn.GetStorageId()) { StorageId = *diffColumn.GetStorageId(); } diff --git a/ydb/core/tx/schemeshard/olap/columns/update.h b/ydb/core/tx/schemeshard/olap/columns/update.h index ec463a69c7c2..84a728829d6e 100644 --- a/ydb/core/tx/schemeshard/olap/columns/update.h +++ b/ydb/core/tx/schemeshard/olap/columns/update.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -17,6 +18,7 @@ class TOlapColumnDiff { YDB_READONLY_DEF(NArrow::NDictionary::TEncodingDiff, DictionaryEncoding); YDB_READONLY_DEF(std::optional, StorageId); YDB_READONLY_DEF(std::optional, DefaultValue); + YDB_READONLY_DEF(NArrow::NAccessor::TRequestedConstructorContainer, AccessorConstructor); public: bool ParseFromRequest(const NKikimrSchemeOp::TOlapColumnDiff& columnSchema, IErrorCollector& errors) { Name = columnSchema.GetName(); @@ -30,6 +32,12 @@ class TOlapColumnDiff { if (columnSchema.HasDefaultValue()) { DefaultValue = columnSchema.GetDefaultValue(); } + if (columnSchema.HasDataAccessorConstructor()) { + if (!AccessorConstructor.DeserializeFromProto(columnSchema.GetDataAccessorConstructor())) { + errors.AddError("cannot parse accessor constructor from proto"); + return false; + } + } if (columnSchema.HasSerializer()) { if (!Serializer.DeserializeFromProto(columnSchema.GetSerializer())) { errors.AddError("cannot parse serializer diff from proto"); @@ -55,6 +63,7 @@ class TOlapColumnAdd { YDB_READONLY_DEF(std::optional, Serializer); YDB_READONLY_DEF(std::optional, DictionaryEncoding); YDB_READONLY_DEF(NOlap::TColumnDefaultScalarValue, DefaultValue); + YDB_READONLY_DEF(NArrow::NAccessor::TConstructorContainer, AccessorConstructor); public: TOlapColumnAdd(const std::optional& keyOrder) : KeyOrder(keyOrder) {