2
2
#include " position.h"
3
3
#include " heap.h"
4
4
#include " result_builder.h"
5
+ #include " batch_iterator.h"
5
6
6
7
#include < ydb/core/formats/arrow/arrow_filter.h>
7
8
@@ -20,135 +21,6 @@ class TMergePartialStream {
20
21
const std::vector<std::string> VersionColumnNames;
21
22
ui32 ControlPoints = 0 ;
22
23
23
- class TBatchIterator {
24
- private:
25
- bool ControlPointFlag;
26
- TSortableBatchPosition KeyColumns;
27
- TSortableBatchPosition VersionColumns;
28
- i64 RecordsCount;
29
- int ReverseSortKff;
30
-
31
- std::shared_ptr<NArrow::TColumnFilter> Filter;
32
- std::shared_ptr<NArrow::TColumnFilter::TIterator> FilterIterator;
33
-
34
- i32 GetFirstPosition () const {
35
- if (ReverseSortKff > 0 ) {
36
- return 0 ;
37
- } else {
38
- return RecordsCount - 1 ;
39
- }
40
- }
41
-
42
- public:
43
- NJson::TJsonValue DebugJson () const ;
44
-
45
- const std::shared_ptr<NArrow::TColumnFilter>& GetFilter () const {
46
- return Filter;
47
- }
48
-
49
- bool IsControlPoint () const {
50
- return ControlPointFlag;
51
- }
52
-
53
- const TSortableBatchPosition& GetKeyColumns () const {
54
- return KeyColumns;
55
- }
56
-
57
- const TSortableBatchPosition& GetVersionColumns () const {
58
- return VersionColumns;
59
- }
60
-
61
- TBatchIterator (const TSortableBatchPosition& keyColumns)
62
- : ControlPointFlag(true )
63
- , KeyColumns(keyColumns)
64
- {
65
-
66
- }
67
-
68
- template <class TDataContainer >
69
- TBatchIterator (std::shared_ptr<TDataContainer> batch, std::shared_ptr<NArrow::TColumnFilter> filter,
70
- const std::vector<std::string>& keyColumns, const std::vector<std::string>& dataColumns, const bool reverseSort, const std::vector<std::string>& versionColumnNames)
71
- : ControlPointFlag(false )
72
- , KeyColumns(batch, 0 , keyColumns, dataColumns, reverseSort)
73
- , VersionColumns(batch, 0 , versionColumnNames, {}, false )
74
- , RecordsCount(batch->num_rows ())
75
- , ReverseSortKff(reverseSort ? -1 : 1 )
76
- , Filter(filter)
77
- {
78
- Y_ABORT_UNLESS (KeyColumns.InitPosition (GetFirstPosition ()));
79
- Y_ABORT_UNLESS (VersionColumns.InitPosition (GetFirstPosition ()));
80
- if (Filter) {
81
- FilterIterator = std::make_shared<NArrow::TColumnFilter::TIterator>(Filter->GetIterator (reverseSort, RecordsCount));
82
- }
83
- }
84
-
85
- bool CheckNextBatch (const TBatchIterator& nextIterator) {
86
- return KeyColumns.Compare (nextIterator.KeyColumns ) == std::partial_ordering::less;
87
- }
88
-
89
- bool IsReverse () const {
90
- return ReverseSortKff < 0 ;
91
- }
92
-
93
- bool IsDeleted () const {
94
- if (!FilterIterator) {
95
- return false ;
96
- }
97
- return !FilterIterator->GetCurrentAcceptance ();
98
- }
99
-
100
- TSortableBatchPosition::TFoundPosition SkipToLower (const TSortableBatchPosition& pos) {
101
- const ui32 posStart = KeyColumns.GetPosition ();
102
- auto result = KeyColumns.SkipToLower (pos);
103
- const i32 delta = IsReverse () ? (posStart - KeyColumns.GetPosition ()) : (KeyColumns.GetPosition () - posStart);
104
- AFL_VERIFY (delta >= 0 );
105
- AFL_VERIFY (VersionColumns.InitPosition (KeyColumns.GetPosition ()))(" pos" , KeyColumns.GetPosition ())(" size" , VersionColumns.GetRecordsCount ());
106
- if (FilterIterator && delta) {
107
- AFL_VERIFY (FilterIterator->Next (delta));
108
- }
109
- return result;
110
- }
111
-
112
- bool Next () {
113
- const bool result = KeyColumns.NextPosition (ReverseSortKff) && VersionColumns.NextPosition (ReverseSortKff);
114
- if (FilterIterator) {
115
- Y_ABORT_UNLESS (result == FilterIterator->Next (1 ));
116
- }
117
- return result;
118
- }
119
-
120
- bool operator <(const TBatchIterator& item) const {
121
- const std::partial_ordering result = KeyColumns.Compare (item.KeyColumns );
122
- if (result == std::partial_ordering::equivalent) {
123
- if (IsControlPoint () && item.IsControlPoint ()) {
124
- return false ;
125
- } else if (IsControlPoint ()) {
126
- return false ;
127
- } else if (item.IsControlPoint ()) {
128
- return true ;
129
- }
130
- // don't need inverse through we need maximal version at first (reverse analytic not included in VersionColumns)
131
- return VersionColumns.Compare (item.VersionColumns ) == std::partial_ordering::less;
132
- } else {
133
- // inverse logic through we use max heap, but need minimal element if not reverse (reverse analytic included in KeyColumns)
134
- return result == std::partial_ordering::greater;
135
- }
136
- }
137
- };
138
-
139
- class TIteratorData {
140
- private:
141
- YDB_READONLY_DEF (std::shared_ptr<arrow::RecordBatch>, Batch);
142
- YDB_READONLY_DEF (std::shared_ptr<NArrow::TColumnFilter>, Filter);
143
- public:
144
- TIteratorData (std::shared_ptr<arrow::RecordBatch> batch, std::shared_ptr<NArrow::TColumnFilter> filter)
145
- : Batch(batch)
146
- , Filter(filter)
147
- {
148
-
149
- }
150
- };
151
-
152
24
TSortingHeap<TBatchIterator> SortHeap;
153
25
154
26
NJson::TJsonValue DebugJson () const {
@@ -164,9 +36,6 @@ class TMergePartialStream {
164
36
165
37
std::optional<TSortableBatchPosition> DrainCurrentPosition ();
166
38
167
- void AddNewToHeap (std::shared_ptr<arrow::RecordBatch> batch, std::shared_ptr<NArrow::TColumnFilter> filter);
168
- void AddNewToHeap (std::shared_ptr<arrow::Table> batch, std::shared_ptr<NArrow::TColumnFilter> filter);
169
- void AddNewToHeap (std::shared_ptr<TGeneralContainer> batch, std::shared_ptr<NArrow::TColumnFilter> filter);
170
39
void CheckSequenceInDebug (const TSortableBatchPosition& nextKeyColumnsPosition);
171
40
public:
172
41
TMergePartialStream (std::shared_ptr<arrow::Schema> sortSchema, std::shared_ptr<arrow::Schema> dataSchema, const bool reverse, const std::vector<std::string>& versionColumnNames)
@@ -233,9 +102,18 @@ class TMergePartialStream {
233
102
return SortHeap.Size () && SortHeap.Current ().IsControlPoint ();
234
103
}
235
104
236
- void AddSource (std::shared_ptr<arrow::RecordBatch> batch, std::shared_ptr<NArrow::TColumnFilter> filter);
237
- void AddSource (std::shared_ptr<arrow::Table> batch, std::shared_ptr<NArrow::TColumnFilter> filter);
238
- void AddSource (std::shared_ptr<TGeneralContainer> batch, std::shared_ptr<NArrow::TColumnFilter> filter);
105
+ template <class TDataContainer >
106
+ void AddSource (const std::shared_ptr<TDataContainer>& batch, const std::shared_ptr<NArrow::TColumnFilter>& filter) {
107
+ if (!batch || !batch->num_rows ()) {
108
+ return ;
109
+ }
110
+ if (filter && filter->IsTotalDenyFilter ()) {
111
+ return ;
112
+ }
113
+ // Y_DEBUG_ABORT_UNLESS(NArrow::IsSorted(batch, SortSchema));
114
+ auto filterImpl = (!filter || filter->IsTotalAllowFilter ()) ? nullptr : filter;
115
+ SortHeap.Push (TBatchIterator (batch, filterImpl, SortSchema->field_names (), DataSchema ? DataSchema->field_names () : std::vector<std::string>(), Reverse, VersionColumnNames));
116
+ }
239
117
240
118
bool IsEmpty () const {
241
119
return !SortHeap.Size ();
0 commit comments