1
1
#include " accessor.h"
2
+
3
+ #include < ydb/core/formats/arrow/arrow_helpers.h>
4
+ #include < ydb/core/formats/arrow/permutations.h>
5
+ #include < ydb/core/formats/arrow/save_load/saver.h>
2
6
#include < ydb/core/formats/arrow/size_calcer.h>
7
+ #include < ydb/core/formats/arrow/splitter/simple.h>
3
8
#include < ydb/core/formats/arrow/switch/compare.h>
4
9
#include < ydb/core/formats/arrow/switch/switch_type.h>
10
+
5
11
#include < ydb/library/actors/core/log.h>
6
- #include < ydb/core/formats/arrow/permutations.h>
7
- #include < ydb/core/formats/arrow/arrow_helpers.h>
8
- #include < ydb/core/formats/arrow/splitter/simple.h>
9
- #include < ydb/core/formats/arrow/save_load/saver.h>
10
12
11
13
namespace NKikimr ::NArrow::NAccessor {
12
14
@@ -17,18 +19,18 @@ void IChunkedArray::TReader::AppendPositionTo(arrow::ArrayBuilder& builder, cons
17
19
18
20
std::shared_ptr<arrow::Array> IChunkedArray::TReader::CopyRecord (const ui64 recordIndex) const {
19
21
auto address = GetReadChunk (recordIndex);
20
- return NArrow::CopyRecords (address.GetArray (), {address.GetPosition ()});
22
+ return NArrow::CopyRecords (address.GetArray (), { address.GetPosition () });
21
23
}
22
24
23
25
std::shared_ptr<arrow::ChunkedArray> IChunkedArray::Slice (const ui32 offset, const ui32 count) const {
24
26
AFL_VERIFY (offset + count <= (ui64)GetRecordsCount ())(" offset" , offset)(" count" , count)(" length" , GetRecordsCount ());
25
27
ui32 currentOffset = offset;
26
28
ui32 countLeast = count;
27
29
std::vector<std::shared_ptr<arrow::Array>> chunks;
28
- auto address = GetChunk ({}, offset);
30
+ auto address = GetChunkSlow ( offset);
29
31
while (countLeast) {
30
- address = GetChunk (address, currentOffset);
31
- const ui64 internalPos = currentOffset - address.GetStartPosition ( );
32
+ address = GetChunk (address. GetAddress () , currentOffset);
33
+ const ui64 internalPos = address.GetAddress (). GetLocalIndex (currentOffset );
32
34
if (internalPos + countLeast <= (ui64)address.GetArray ()->length ()) {
33
35
chunks.emplace_back (address.GetArray ()->Slice (internalPos, countLeast));
34
36
break ;
@@ -43,12 +45,73 @@ std::shared_ptr<arrow::ChunkedArray> IChunkedArray::Slice(const ui32 offset, con
43
45
return std::make_shared<arrow::ChunkedArray>(chunks, DataType);
44
46
}
45
47
48
+ NKikimr::NArrow::NAccessor::IChunkedArray::TFullDataAddress IChunkedArray::GetChunk (
49
+ const std::optional<TAddressChain>& chunkCurrent, const ui64 position) const {
50
+ AFL_VERIFY (position < GetRecordsCount ());
51
+ std::optional<TCommonChunkAddress> address;
52
+
53
+ if (IsDataOwner ()) {
54
+ if (chunkCurrent) {
55
+ AFL_VERIFY (chunkCurrent->GetSize () == 1 )(" size" , chunkCurrent->GetSize ());
56
+ }
57
+ auto localAddress = GetLocalData (address, position);
58
+ TAddressChain addressChain;
59
+ addressChain.Add (localAddress.GetAddress ());
60
+ AFL_VERIFY (addressChain.Contains (position));
61
+ return TFullDataAddress (localAddress.GetArray (), std::move (addressChain));
62
+ } else {
63
+ auto chunkedArrayAddress = GetArray (chunkCurrent, position, nullptr );
64
+ if (chunkCurrent) {
65
+ AFL_VERIFY (chunkCurrent->GetSize () == 1 + chunkedArrayAddress.GetAddress ().GetSize ())(" current" , chunkCurrent->GetSize ())(
66
+ " chunked" , chunkedArrayAddress.GetAddress ().GetSize ());
67
+ }
68
+ auto localAddress = chunkedArrayAddress.GetArray ()->GetLocalData (address, chunkedArrayAddress.GetAddress ().GetLocalIndex (position));
69
+ auto fullAddress = std::move (chunkedArrayAddress.MutableAddress ());
70
+ fullAddress.Add (localAddress.GetAddress ());
71
+ AFL_VERIFY (fullAddress.Contains (position));
72
+ return TFullDataAddress (localAddress.GetArray (), std::move (fullAddress));
73
+ }
74
+ }
75
+
76
+ IChunkedArray::TFullChunkedArrayAddress IChunkedArray::GetArray (
77
+ const std::optional<TAddressChain>& chunkCurrent, const ui64 position, const std::shared_ptr<IChunkedArray>& selfPtr) const {
78
+ AFL_VERIFY (position < GetRecordsCount ());
79
+ if (IsDataOwner ()) {
80
+ AFL_VERIFY (selfPtr);
81
+ TAddressChain chain;
82
+ chain.Add (TCommonChunkAddress (0 , GetRecordsCount (), 0 ));
83
+ return IChunkedArray::TFullChunkedArrayAddress (selfPtr, std::move (chain));
84
+ }
85
+ TAddressChain addressChain;
86
+
87
+ auto * currentLevel = this ;
88
+ ui32 currentPosition = position;
89
+ ui32 idx = 0 ;
90
+ std::vector<std::shared_ptr<IChunkedArray>> chainForTemporarySave;
91
+ while (!currentLevel->IsDataOwner ()) {
92
+ std::optional<TCommonChunkAddress> currentAddress;
93
+ if (chunkCurrent) {
94
+ currentAddress = chunkCurrent->GetAddress (idx);
95
+ }
96
+ auto nextChunkedArray = currentLevel->GetLocalChunkedArray (currentAddress, currentPosition);
97
+ chainForTemporarySave.emplace_back (nextChunkedArray.GetArray ());
98
+ currentLevel = chainForTemporarySave.back ().get ();
99
+ addressChain.Add (nextChunkedArray.GetAddress ());
100
+ AFL_VERIFY (nextChunkedArray.GetAddress ().GetStartPosition () <= currentPosition);
101
+ currentPosition -= nextChunkedArray.GetAddress ().GetStartPosition ();
102
+ ++idx;
103
+ }
104
+ AFL_VERIFY (!chunkCurrent || chunkCurrent->GetSize () - idx <= 1 )(" idx" , idx)(" size" , chunkCurrent->GetSize ());
105
+ return TFullChunkedArrayAddress (chainForTemporarySave.back (), std::move (addressChain));
106
+ }
107
+
46
108
// Renders the record at the global index `position` as a debug string.
// The record is first resolved to (chunk array, position inside that chunk) via GetReadChunk.
TString IChunkedArray::TReader::DebugString(const ui32 position) const {
    TAddress chunk = GetReadChunk(position);
    return NArrow::DebugString(chunk.GetArray(), chunk.GetPosition());
}
50
112
51
- std::partial_ordering IChunkedArray::TReader::CompareColumns (const std::vector<TReader>& l, const ui64 lPosition, const std::vector<TReader>& r, const ui64 rPosition) {
113
+ std::partial_ordering IChunkedArray::TReader::CompareColumns (
114
+ const std::vector<TReader>& l, const ui64 lPosition, const std::vector<TReader>& r, const ui64 rPosition) {
52
115
AFL_VERIFY (l.size () == r.size ());
53
116
for (ui32 i = 0 ; i < l.size (); ++i) {
54
117
const TAddress lAddress = l[i].GetReadChunk (lPosition);
@@ -63,43 +126,38 @@ std::partial_ordering IChunkedArray::TReader::CompareColumns(const std::vector<T
63
126
64
127
// Translates the global record index `position` into (chunk array, index inside that chunk).
// The previously resolved chunk is reused while it still contains `position`; otherwise
// CurrentChunkAddress is refreshed through ChunkedArray->GetChunk().
IChunkedArray::TAddress IChunkedArray::TReader::GetReadChunk(const ui64 position) const {
    AFL_VERIFY(position < ChunkedArray->GetRecordsCount());
    const bool cachedChunkFits = CurrentChunkAddress && CurrentChunkAddress->GetAddress().Contains(position);
    if (!cachedChunkFits) {
        CurrentChunkAddress = ChunkedArray->GetChunk(CurrentChunkAddress, position);
    }
    return IChunkedArray::TAddress(CurrentChunkAddress->GetArray(), CurrentChunkAddress->GetAddress().GetLocalIndex(position));
}
72
135
73
136
// Compares the record this address points to with the record `item` points to,
// delegating to TComparator::TypedCompare<true>.
const std::partial_ordering IChunkedArray::TAddress::Compare(const TAddress& item) const {
    auto& ownArray = *Array;
    auto& otherArray = *item.Array;
    return TComparator::TypedCompare<true>(ownArray, Position, otherArray, item.Position);
}
76
139
77
- TChunkedArraySerialized::TChunkedArraySerialized (const std::shared_ptr<IChunkedArray>& array, const TString& serializedData)
140
+ TChunkedArraySerialized::TChunkedArraySerialized (const std::shared_ptr<IChunkedArray>& array, const TString& serializedData)
78
141
: Array(array)
79
142
, SerializedData(serializedData) {
80
143
AFL_VERIFY (serializedData);
81
144
AFL_VERIFY (Array);
82
145
AFL_VERIFY (Array->GetRecordsCount ());
83
146
}
84
147
85
- std::partial_ordering IChunkedArray::TCurrentChunkAddress::Compare (const ui64 position, const TCurrentChunkAddress& item, const ui64 itemPosition) const {
86
- AFL_VERIFY (GetStartPosition () <= position)(" pos" , position)(" start" , GetStartPosition ());
87
- AFL_VERIFY (position < GetFinishPosition ())(" pos" , position)(" finish" , GetFinishPosition ());
88
- AFL_VERIFY (item.GetStartPosition () <= itemPosition)(" start" , item.GetStartPosition ())(" item" , itemPosition);
89
- AFL_VERIFY (itemPosition < item.GetFinishPosition ())(" item" , itemPosition)(" finish" , item.GetFinishPosition ());
90
- return TComparator::TypedCompare<true >(*Array, position - GetStartPosition (), *item.Array , itemPosition - item.GetStartPosition ());
148
+ std::partial_ordering IChunkedArray::TFullDataAddress::Compare (
149
+ const ui64 position, const TFullDataAddress& item, const ui64 itemPosition) const {
150
+ AFL_VERIFY (Address.Contains (position))(" pos" , position)(" start" , Address.DebugString ());
151
+ AFL_VERIFY (item.Address .Contains (itemPosition))(" pos" , itemPosition)(" start" , item.Address .DebugString ());
152
+ return TComparator::TypedCompare<true >(*Array, Address.GetLocalIndex (position), *item.Array , item.Address .GetLocalIndex (itemPosition));
91
153
}
92
154
93
- std::shared_ptr<arrow::Array> IChunkedArray::TCurrentChunkAddress::CopyRecord (const ui64 recordIndex) const {
94
- AFL_VERIFY (GetStartPosition () <= recordIndex);
95
- AFL_VERIFY (recordIndex < GetFinishPosition ());
96
- return NArrow::CopyRecords (Array, { recordIndex - GetStartPosition () });
155
+ std::shared_ptr<arrow::Array> IChunkedArray::TFullDataAddress::CopyRecord (const ui64 recordIndex) const {
156
+ return NArrow::CopyRecords (Array, { Address.GetLocalIndex (recordIndex) });
97
157
}
98
158
99
- TString IChunkedArray::TCurrentChunkAddress::DebugString (const ui64 position) const {
100
- AFL_VERIFY (position < GetFinishPosition ());
101
- AFL_VERIFY (GetStartPosition () <= position);
102
- return NArrow::DebugString (Array, position - GetStartPosition ());
159
+ TString IChunkedArray::TFullDataAddress::DebugString (const ui64 position) const {
160
+ return NArrow::DebugString (Array, Address.GetLocalIndex (position));
103
161
}
104
162
105
- }
163
+ } // namespace NKikimr::NArrow::NAccessor
0 commit comments