Skip to content

Commit ae7146c

Browse files
implement sparse data accessor (#7055)
1 parent afd4240 commit ae7146c

File tree

142 files changed

+2951
-1315
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

142 files changed

+2951
-1315
lines changed

ydb/core/formats/arrow/common/accessor.cpp renamed to ydb/core/formats/arrow/accessor/abstract/accessor.cpp

+19 -49
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,8 @@
55
#include <ydb/library/actors/core/log.h>
66
#include <ydb/core/formats/arrow/permutations.h>
77
#include <ydb/core/formats/arrow/arrow_helpers.h>
8+
#include <ydb/core/formats/arrow/splitter/simple.h>
9+
#include <ydb/core/formats/arrow/save_load/saver.h>
810

911
namespace NKikimr::NArrow::NAccessor {
1012

@@ -72,64 +74,32 @@ const std::partial_ordering IChunkedArray::TAddress::Compare(const TAddress& ite
7274
return TComparator::TypedCompare<true>(*Array, Position, *item.Array, item.Position);
7375
}
7476

75-
namespace {
76-
class TChunkAccessor {
77-
private:
78-
std::shared_ptr<arrow::ChunkedArray> ChunkedArray;
79-
public:
80-
TChunkAccessor(const std::shared_ptr<arrow::ChunkedArray>& chunkedArray)
81-
: ChunkedArray(chunkedArray)
82-
{
83-
84-
}
85-
ui64 GetChunksCount() const {
86-
return (ui64)ChunkedArray->num_chunks();
87-
}
88-
ui64 GetChunkLength(const ui32 idx) const {
89-
return (ui64)ChunkedArray->chunk(idx)->length();
90-
}
91-
std::shared_ptr<arrow::Array> GetArray(const ui32 idx) const {
92-
return ChunkedArray->chunk(idx);
93-
}
94-
};
95-
96-
}
97-
98-
std::optional<ui64> TTrivialArray::DoGetRawSize() const {
99-
return NArrow::GetArrayDataSize(Array);
77+
TChunkedArraySerialized::TChunkedArraySerialized(const std::shared_ptr<IChunkedArray>& array, const TString& serializedData)
78+
: Array(array)
79+
, SerializedData(serializedData) {
80+
AFL_VERIFY(serializedData);
81+
AFL_VERIFY(Array);
82+
AFL_VERIFY(Array->GetRecordsCount());
10083
}
10184

10285
std::partial_ordering IChunkedArray::TCurrentChunkAddress::Compare(const ui64 position, const TCurrentChunkAddress& item, const ui64 itemPosition) const {
103-
AFL_VERIFY(StartPosition <= position);
104-
AFL_VERIFY(position < FinishPosition);
105-
AFL_VERIFY(item.StartPosition <= itemPosition);
106-
AFL_VERIFY(itemPosition < item.FinishPosition);
107-
return TComparator::TypedCompare<true>(*Array, position - StartPosition, *item.Array, itemPosition - item.StartPosition);
86+
AFL_VERIFY(GetStartPosition() <= position)("pos", position)("start", GetStartPosition());
87+
AFL_VERIFY(position < GetFinishPosition())("pos", position)("finish", GetFinishPosition());
88+
AFL_VERIFY(item.GetStartPosition() <= itemPosition)("start", item.GetStartPosition())("item", itemPosition);
89+
AFL_VERIFY(itemPosition < item.GetFinishPosition())("item", itemPosition)("finish", item.GetFinishPosition());
90+
return TComparator::TypedCompare<true>(*Array, position - GetStartPosition(), *item.Array, itemPosition - item.GetStartPosition());
10891
}
10992

11093
std::shared_ptr<arrow::Array> IChunkedArray::TCurrentChunkAddress::CopyRecord(const ui64 recordIndex) const {
111-
AFL_VERIFY(StartPosition <= recordIndex);
112-
AFL_VERIFY(recordIndex < FinishPosition);
113-
return NArrow::CopyRecords(Array, { recordIndex - StartPosition });
94+
AFL_VERIFY(GetStartPosition() <= recordIndex);
95+
AFL_VERIFY(recordIndex < GetFinishPosition());
96+
return NArrow::CopyRecords(Array, { recordIndex - GetStartPosition() });
11497
}
11598

11699
TString IChunkedArray::TCurrentChunkAddress::DebugString(const ui64 position) const {
117-
AFL_VERIFY(position < FinishPosition);
118-
AFL_VERIFY(StartPosition <= position);
119-
return NArrow::DebugString(Array, position - StartPosition);
120-
}
121-
122-
IChunkedArray::TCurrentChunkAddress TTrivialChunkedArray::DoGetChunk(const std::optional<TCurrentChunkAddress>& chunkCurrent, const ui64 position) const {
123-
TChunkAccessor accessor(Array);
124-
return SelectChunk(chunkCurrent, position, accessor);
125-
}
126-
127-
std::optional<ui64> TTrivialChunkedArray::DoGetRawSize() const {
128-
ui64 result = 0;
129-
for (auto&& i : Array->chunks()) {
130-
result += NArrow::GetArrayDataSize(i);
131-
}
132-
return result;
100+
AFL_VERIFY(position < GetFinishPosition());
101+
AFL_VERIFY(GetStartPosition() <= position);
102+
return NArrow::DebugString(Array, position - GetStartPosition());
133103
}
134104

135105
}

0 commit comments

Comments
 (0)