Skip to content

Commit 46db233

Browse files
additional test and addressing fixes (#7168)
1 parent 4743435 commit 46db233

File tree

23 files changed

+516
-285
lines changed

23 files changed

+516
-285
lines changed
Lines changed: 86 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
#include "accessor.h"
2+
3+
#include <ydb/core/formats/arrow/arrow_helpers.h>
4+
#include <ydb/core/formats/arrow/permutations.h>
5+
#include <ydb/core/formats/arrow/save_load/saver.h>
26
#include <ydb/core/formats/arrow/size_calcer.h>
7+
#include <ydb/core/formats/arrow/splitter/simple.h>
38
#include <ydb/core/formats/arrow/switch/compare.h>
49
#include <ydb/core/formats/arrow/switch/switch_type.h>
10+
511
#include <ydb/library/actors/core/log.h>
6-
#include <ydb/core/formats/arrow/permutations.h>
7-
#include <ydb/core/formats/arrow/arrow_helpers.h>
8-
#include <ydb/core/formats/arrow/splitter/simple.h>
9-
#include <ydb/core/formats/arrow/save_load/saver.h>
1012

1113
namespace NKikimr::NArrow::NAccessor {
1214

@@ -17,18 +19,18 @@ void IChunkedArray::TReader::AppendPositionTo(arrow::ArrayBuilder& builder, cons
1719

1820
// Copies the single record at index `recordIndex` of the reader's chunked array.
// The chunk holding the record is resolved through GetReadChunk(); the chunk's
// array and the position reported by that address are then handed to
// NArrow::CopyRecords as a one-element position list.
// In this diff view the two `return` lines are the removed (20-) and added (22+)
// sides of the same statement; they differ only in brace spacing.
std::shared_ptr<arrow::Array> IChunkedArray::TReader::CopyRecord(const ui64 recordIndex) const {
1921
auto address = GetReadChunk(recordIndex);
20-
return NArrow::CopyRecords(address.GetArray(), {address.GetPosition()});
22+
return NArrow::CopyRecords(address.GetArray(), { address.GetPosition() });
2123
}
2224

2325
std::shared_ptr<arrow::ChunkedArray> IChunkedArray::Slice(const ui32 offset, const ui32 count) const {
2426
AFL_VERIFY(offset + count <= (ui64)GetRecordsCount())("offset", offset)("count", count)("length", GetRecordsCount());
2527
ui32 currentOffset = offset;
2628
ui32 countLeast = count;
2729
std::vector<std::shared_ptr<arrow::Array>> chunks;
28-
auto address = GetChunk({}, offset);
30+
auto address = GetChunkSlow(offset);
2931
while (countLeast) {
30-
address = GetChunk(address, currentOffset);
31-
const ui64 internalPos = currentOffset - address.GetStartPosition();
32+
address = GetChunk(address.GetAddress(), currentOffset);
33+
const ui64 internalPos = address.GetAddress().GetLocalIndex(currentOffset);
3234
if (internalPos + countLeast <= (ui64)address.GetArray()->length()) {
3335
chunks.emplace_back(address.GetArray()->Slice(internalPos, countLeast));
3436
break;
@@ -43,12 +45,73 @@ std::shared_ptr<arrow::ChunkedArray> IChunkedArray::Slice(const ui32 offset, con
4345
return std::make_shared<arrow::ChunkedArray>(chunks, DataType);
4446
}
4547

48+
// Resolves the data chunk that contains `position` and returns it together with
// the address chain that leads to it.
//
// chunkCurrent - optional address chain from a previous lookup. In this function
//                it is only size-checked and forwarded to GetArray().
// position     - record index inside this array; verified to be < GetRecordsCount().
//
// Two cases are handled:
//   * this object is a data owner: the chain consists of the single address
//     returned by GetLocalData();
//   * otherwise: GetArray() resolves the nested chunked array first, then that
//     array's GetLocalData() is asked with the index local to it, and its address
//     is appended to the chain returned by GetArray().
// In both cases the resulting chain is verified to contain `position`.
//
// NOTE(review): the local `address` is declared empty and never assigned before
// being passed to GetLocalData() in either branch, so no hint derived from
// `chunkCurrent` reaches GetLocalData() here - confirm this is intended.
NKikimr::NArrow::NAccessor::IChunkedArray::TFullDataAddress IChunkedArray::GetChunk(
49+
const std::optional<TAddressChain>& chunkCurrent, const ui64 position) const {
50+
AFL_VERIFY(position < GetRecordsCount());
51+
std::optional<TCommonChunkAddress> address;
52+
53+
if (IsDataOwner()) {
54+
if (chunkCurrent) {
55+
// A data owner produces a chain of exactly one address (see below), so a
// previously returned chain must have the same size.
AFL_VERIFY(chunkCurrent->GetSize() == 1)("size", chunkCurrent->GetSize());
56+
}
57+
auto localAddress = GetLocalData(address, position);
58+
TAddressChain addressChain;
59+
addressChain.Add(localAddress.GetAddress());
60+
AFL_VERIFY(addressChain.Contains(position));
61+
return TFullDataAddress(localAddress.GetArray(), std::move(addressChain));
62+
} else {
63+
// selfPtr is passed as nullptr: GetArray() only verifies it in its data-owner
// branch, which is not the case for this object in this branch.
auto chunkedArrayAddress = GetArray(chunkCurrent, position, nullptr);
64+
if (chunkCurrent) {
65+
// The previous chain is expected to be the GetArray() chain plus the one
// data-level address appended below.
AFL_VERIFY(chunkCurrent->GetSize() == 1 + chunkedArrayAddress.GetAddress().GetSize())("current", chunkCurrent->GetSize())(
66+
"chunked", chunkedArrayAddress.GetAddress().GetSize());
67+
}
68+
auto localAddress = chunkedArrayAddress.GetArray()->GetLocalData(address, chunkedArrayAddress.GetAddress().GetLocalIndex(position));
69+
// Reuse the chain built by GetArray() and extend it with the data-level address.
auto fullAddress = std::move(chunkedArrayAddress.MutableAddress());
70+
fullAddress.Add(localAddress.GetAddress());
71+
AFL_VERIFY(fullAddress.Contains(position));
72+
return TFullDataAddress(localAddress.GetArray(), std::move(fullAddress));
73+
}
74+
}
75+
76+
// Walks down nested chunked arrays until a data owner is reached and returns that
// array together with the chain of addresses visited on the way.
//
// chunkCurrent - optional address chain from a previous lookup; its idx-th element
//                is passed as a hint to GetLocalChunkedArray() on level idx.
// position     - record index inside this array; verified to be < GetRecordsCount().
// selfPtr      - shared pointer returned when this object itself is the data
//                owner; verified non-null only in that case.
//
// NOTE(review): `position` is ui64 but is copied into the ui32 `currentPosition`;
// values above the ui32 range would be truncated - confirm callers never exceed it.
// NOTE(review): the final check computes `chunkCurrent->GetSize() - idx`; if
// GetSize() returns an unsigned type and idx > GetSize(), the subtraction wraps -
// TODO confirm the return type of GetSize().
IChunkedArray::TFullChunkedArrayAddress IChunkedArray::GetArray(
77+
const std::optional<TAddressChain>& chunkCurrent, const ui64 position, const std::shared_ptr<IChunkedArray>& selfPtr) const {
78+
AFL_VERIFY(position < GetRecordsCount());
79+
if (IsDataOwner()) {
80+
AFL_VERIFY(selfPtr);
81+
// Single-element chain built from (0, GetRecordsCount(), 0).
TAddressChain chain;
82+
chain.Add(TCommonChunkAddress(0, GetRecordsCount(), 0));
83+
return IChunkedArray::TFullChunkedArrayAddress(selfPtr, std::move(chain));
84+
}
85+
TAddressChain addressChain;
86+
87+
auto* currentLevel = this;
88+
ui32 currentPosition = position;
89+
ui32 idx = 0;
90+
// Stores the shared_ptr of every visited level; `currentLevel` is the raw
// pointer taken from the last stored element.
std::vector<std::shared_ptr<IChunkedArray>> chainForTemporarySave;
91+
// Runs at least once: the data-owner case for `this` has already returned above.
while (!currentLevel->IsDataOwner()) {
92+
std::optional<TCommonChunkAddress> currentAddress;
93+
if (chunkCurrent) {
94+
currentAddress = chunkCurrent->GetAddress(idx);
95+
}
96+
auto nextChunkedArray = currentLevel->GetLocalChunkedArray(currentAddress, currentPosition);
97+
chainForTemporarySave.emplace_back(nextChunkedArray.GetArray());
98+
currentLevel = chainForTemporarySave.back().get();
99+
addressChain.Add(nextChunkedArray.GetAddress());
100+
AFL_VERIFY(nextChunkedArray.GetAddress().GetStartPosition() <= currentPosition);
101+
// Rebase the position so it is relative to the start of the nested array.
currentPosition -= nextChunkedArray.GetAddress().GetStartPosition();
102+
++idx;
103+
}
104+
AFL_VERIFY(!chunkCurrent || chunkCurrent->GetSize() - idx <= 1)("idx", idx)("size", chunkCurrent->GetSize());
105+
return TFullChunkedArrayAddress(chainForTemporarySave.back(), std::move(addressChain));
106+
}
107+
46108
// Returns the NArrow::DebugString() rendering of the record at `position`.
// The chunk is resolved through GetReadChunk(); the chunk's array and the position
// stored in the returned address are passed on to NArrow::DebugString.
TString IChunkedArray::TReader::DebugString(const ui32 position) const {
47109
auto address = GetReadChunk(position);
48110
return NArrow::DebugString(address.GetArray(), address.GetPosition());
49111
}
50112

51-
std::partial_ordering IChunkedArray::TReader::CompareColumns(const std::vector<TReader>& l, const ui64 lPosition, const std::vector<TReader>& r, const ui64 rPosition) {
113+
std::partial_ordering IChunkedArray::TReader::CompareColumns(
114+
const std::vector<TReader>& l, const ui64 lPosition, const std::vector<TReader>& r, const ui64 rPosition) {
52115
AFL_VERIFY(l.size() == r.size());
53116
for (ui32 i = 0; i < l.size(); ++i) {
54117
const TAddress lAddress = l[i].GetReadChunk(lPosition);
@@ -63,43 +126,38 @@ std::partial_ordering IChunkedArray::TReader::CompareColumns(const std::vector<T
63126

64127
// Returns the address (chunk array + index local to that chunk) of the record at
// `position`, which is verified to be < ChunkedArray->GetRecordsCount().
// The last resolved chunk is kept in CurrentChunkAddress: if it still contains
// `position` it is reused as is, otherwise the chunk is looked up again with the
// previous address passed along to the lookup.
// In this diff view lines tagged `NN-` are the removed side and `NN+` the added
// side; the added side replaces the manual start/length arithmetic with
// Contains()/GetLocalIndex() and no longer passes a chunk index to TAddress.
IChunkedArray::TAddress IChunkedArray::TReader::GetReadChunk(const ui64 position) const {
65128
AFL_VERIFY(position < ChunkedArray->GetRecordsCount());
66-
if (CurrentChunkAddress && position < CurrentChunkAddress->GetStartPosition() + CurrentChunkAddress->GetArray()->length() && CurrentChunkAddress->GetStartPosition() <= position) {
129+
if (CurrentChunkAddress && CurrentChunkAddress->GetAddress().Contains(position)) {
67130
// Cached chunk still covers `position`: nothing to do.
} else {
68-
CurrentChunkAddress = ChunkedArray->DoGetChunk(CurrentChunkAddress, position);
131+
CurrentChunkAddress = ChunkedArray->GetChunk(CurrentChunkAddress, position);
69132
}
70-
return IChunkedArray::TAddress(CurrentChunkAddress->GetArray(), position - CurrentChunkAddress->GetStartPosition(), CurrentChunkAddress->GetChunkIndex());
133+
return IChunkedArray::TAddress(CurrentChunkAddress->GetArray(), CurrentChunkAddress->GetAddress().GetLocalIndex(position));
71134
}
72135

73136
// Compares the element this address points to (Array, Position) with the element
// `item` points to, delegating to TComparator::TypedCompare<true>.
// NOTE(review): the meaning of the `true` template argument is defined by
// TComparator, which is not visible here - see its declaration.
const std::partial_ordering IChunkedArray::TAddress::Compare(const TAddress& item) const {
74137
return TComparator::TypedCompare<true>(*Array, Position, *item.Array, item.Position);
75138
}
76139

77-
TChunkedArraySerialized::TChunkedArraySerialized(const std::shared_ptr<IChunkedArray>& array, const TString& serializedData)
140+
// Stores a chunked array together with its serialized representation.
// All three preconditions are enforced with AFL_VERIFY: `serializedData` must be
// non-empty (truthy), `array` must be non-null, and the array must report a
// non-zero records count.
TChunkedArraySerialized::TChunkedArraySerialized(const std::shared_ptr<IChunkedArray>& array, const TString& serializedData)
78141
: Array(array)
79142
, SerializedData(serializedData) {
80143
AFL_VERIFY(serializedData);
81144
AFL_VERIFY(Array);
82145
AFL_VERIFY(Array->GetRecordsCount());
83146
}
84147

85-
std::partial_ordering IChunkedArray::TCurrentChunkAddress::Compare(const ui64 position, const TCurrentChunkAddress& item, const ui64 itemPosition) const {
86-
AFL_VERIFY(GetStartPosition() <= position)("pos", position)("start", GetStartPosition());
87-
AFL_VERIFY(position < GetFinishPosition())("pos", position)("finish", GetFinishPosition());
88-
AFL_VERIFY(item.GetStartPosition() <= itemPosition)("start", item.GetStartPosition())("item", itemPosition);
89-
AFL_VERIFY(itemPosition < item.GetFinishPosition())("item", itemPosition)("finish", item.GetFinishPosition());
90-
return TComparator::TypedCompare<true>(*Array, position - GetStartPosition(), *item.Array, itemPosition - item.GetStartPosition());
148+
// Compares the record at `position` of this chunk with the record at
// `itemPosition` of `item`'s chunk.
// Both positions are first verified to be contained in the respective address
// chains, then converted to chunk-local indexes with GetLocalIndex() and compared
// through TComparator::TypedCompare<true>.
// The "start" log field carries the whole Address.DebugString() output, not only
// a start position.
std::partial_ordering IChunkedArray::TFullDataAddress::Compare(
149+
const ui64 position, const TFullDataAddress& item, const ui64 itemPosition) const {
150+
AFL_VERIFY(Address.Contains(position))("pos", position)("start", Address.DebugString());
151+
AFL_VERIFY(item.Address.Contains(itemPosition))("pos", itemPosition)("start", item.Address.DebugString());
152+
return TComparator::TypedCompare<true>(*Array, Address.GetLocalIndex(position), *item.Array, item.Address.GetLocalIndex(itemPosition));
91153
}
92154

93-
std::shared_ptr<arrow::Array> IChunkedArray::TCurrentChunkAddress::CopyRecord(const ui64 recordIndex) const {
94-
AFL_VERIFY(GetStartPosition() <= recordIndex);
95-
AFL_VERIFY(recordIndex < GetFinishPosition());
96-
return NArrow::CopyRecords(Array, { recordIndex - GetStartPosition() });
155+
// Copies the record at `recordIndex` out of this chunk's array via
// NArrow::CopyRecords, after translating the index with Address.GetLocalIndex().
// NOTE(review): no explicit range check is made here; presumably GetLocalIndex()
// validates that `recordIndex` belongs to this address - confirm.
std::shared_ptr<arrow::Array> IChunkedArray::TFullDataAddress::CopyRecord(const ui64 recordIndex) const {
156+
return NArrow::CopyRecords(Array, { Address.GetLocalIndex(recordIndex) });
97157
}
98158

99-
TString IChunkedArray::TCurrentChunkAddress::DebugString(const ui64 position) const {
100-
AFL_VERIFY(position < GetFinishPosition());
101-
AFL_VERIFY(GetStartPosition() <= position);
102-
return NArrow::DebugString(Array, position - GetStartPosition());
159+
// Returns the NArrow::DebugString() rendering of the record at `position`, using
// the chunk-local index obtained from Address.GetLocalIndex().
// NOTE(review): no explicit range check is made here; presumably GetLocalIndex()
// validates that `position` belongs to this address - confirm.
TString IChunkedArray::TFullDataAddress::DebugString(const ui64 position) const {
160+
return NArrow::DebugString(Array, Address.GetLocalIndex(position));
103161
}
104162

105-
}
163+
} // namespace NKikimr::NArrow::NAccessor

0 commit comments

Comments
 (0)