@@ -10,49 +10,72 @@ namespace {
10
10
11
11
TString LogPrefix = " JsonParser: " ;
12
12
13
- } // anonymous namespace
13
// Accumulates the payloads of topic messages into one contiguous text buffer (Values)
// and records the offset of every added message (Offsets).
// NumberValues counts the messages added since the last Clear(); Finished is set by
// Finish() and reset by Clear(); CreationStartTime is taken at construction and
// refreshed by Clear().
+ struct TJsonParserBuffer {
14
+ size_t NumberValues = 0 ;
15
+ bool Finished = false ;
16
+ TInstant CreationStartTime = TInstant::Now();
17
+ TVector<ui64> Offsets = {};
18
+
19
// Returns true when the buffer has not been finished and holds at least one value.
+ bool IsReady () const {
20
+ return !Finished && NumberValues > 0 ;
21
+ }
14
22
15
- namespace NFq {
23
// Returns the current length of the accumulated text in Values.
+ size_t GetSize () const {
24
+ return Values.size ();
25
+ }
16
26
17
- // // TParserBuffer
27
// Reserves capacity for 2 * (size + simdjson::SIMDJSON_PADDING) characters in Values
// and for numberValues entries in Offsets.
// NOTE(review): the factor of two presumably leaves room for the copies made by
// AddHolder() - confirm.
+ void Reserve (size_t size, size_t numberValues) {
28
+ Values.reserve (2 * (size + simdjson::SIMDJSON_PADDING));
29
+ Offsets.reserve (numberValues);
30
+ }
18
31
19
- TJsonParserBuffer::TJsonParserBuffer ()
20
- : NumberValues(0 )
21
- , Finished(false )
22
- {}
32
// Appends the data of every message to Values and its offset to Offsets.
// Fails the Y_ENSURE check if the buffer has already been finished.
+ void AddMessages (const TVector<NYdb::NTopic::TReadSessionEvent::TDataReceivedEvent::TMessage>& messages) {
33
+ Y_ENSURE (!Finished, " Cannot add messages into finished buffer" );
23
34
24
- void TJsonParserBuffer::Reserve ( size_t size) {
25
- Y_ENSURE (!Finished, " Cannot reserve finished buffer " );
26
- Values. reserve ( 2 * (size + simdjson::SIMDJSON_PADDING) );
27
- }
35
// First pass: total payload size, so that capacity is reserved once before appending.
+ size_t messagesSize = 0 ;
36
+ for ( const auto & message : messages) {
37
+ messagesSize += message. GetData (). size ( );
38
+ }
28
39
29
- void TJsonParserBuffer::AddValue (const TString& value) {
30
- Y_ENSURE (!Finished, " Cannot add value into finished buffer" );
31
- NumberValues++;
32
- Values << value;
33
- }
40
// Second pass: reserve for the already stored text plus the new payloads, then append.
+ NumberValues += messages.size ();
41
+ Reserve (Values.size () + messagesSize, NumberValues);
42
+ for (const auto & message : messages) {
43
+ Values << message.GetData ();
44
+ Offsets.emplace_back (message.GetOffset ());
45
+ }
46
+ }
34
47
35
- std::string_view TJsonParserBuffer:: AddHolder (std::string_view value) {
36
- Y_ENSURE (Values.size () + value.size () <= Values.capacity (), " Requested too large holders" );
37
- const size_t startPos = Values.size ();
38
- Values << value;
39
- return std::string_view (Values).substr (startPos, value.length ());
40
- }
48
// Copies value to the end of Values and returns a view of the stored copy.
// The append must fit into the already reserved capacity, otherwise Y_ENSURE fails.
// NOTE(review): the capacity check presumably exists so that the append never
// reallocates and previously returned views stay valid - confirm.
+ std::string_view AddHolder (std::string_view value) {
49
+ Y_ENSURE (Values.size () + value.size () <= Values.capacity (), " Requested too large holders" );
50
+ const size_t startPos = Values.size ();
51
+ Values << value;
52
+ return std::string_view (Values).substr (startPos, value.length ());
53
+ }
41
54
42
- std::pair<const char *, size_t > TJsonParserBuffer:: Finish () {
43
- Y_ENSURE (!Finished, " Cannot finish buffer twice" );
44
- Finished = true ;
45
- Values << TString (simdjson::SIMDJSON_PADDING, ' ' );
46
- Values.reserve (2 * Values.size ());
47
- return {Values.data (), Values.size ()};
48
- }
55
// Marks the buffer as finished, appends simdjson::SIMDJSON_PADDING padding characters,
// reserves twice the resulting size and returns the data pointer together with the
// size of Values (the returned size includes the appended padding).
// Fails the Y_ENSURE check when called on an already finished buffer.
+ std::pair<const char *, size_t > Finish () {
56
+ Y_ENSURE (!Finished, " Cannot finish buffer twice" );
57
+ Finished = true ;
58
+ Values << TString (simdjson::SIMDJSON_PADDING, ' ' );
59
+ Values.reserve (2 * Values.size ());
60
+ return {Values.data (), Values.size ()};
61
+ }
49
62
50
- void TJsonParserBuffer::Clear () {
51
- Y_ENSURE (Finished, " Cannot clear not finished buffer" );
52
- NumberValues = 0 ;
53
- Finished = false ;
54
- Values.clear ();
55
- }
63
// Resets a finished buffer for reuse: zeroes the counter, drops the finished flag,
// restarts CreationStartTime and clears Values and Offsets.
// Fails the Y_ENSURE check if the buffer has not been finished.
+ void Clear () {
64
+ Y_ENSURE (Finished, " Cannot clear not finished buffer" );
65
+ NumberValues = 0 ;
66
+ Finished = false ;
67
+ CreationStartTime = TInstant::Now ();
68
+ Values.clear ();
69
+ Offsets.clear ();
70
+ }
71
+
72
+ private:
73
// Concatenated message payloads, followed by the padding once Finish() was called
// and by any copies added through AddHolder().
+ TStringBuilder Values = {};
74
+ };
75
+
76
+ } // anonymous namespace
77
+
78
+ namespace NFq {
56
79
57
80
// // TJsonParser
58
81
@@ -63,10 +86,13 @@ class TJsonParser::TImpl {
63
86
};
64
87
65
88
public:
66
- TImpl (const TVector<TString>& columns, const TVector<TString>& types)
67
- : ParsedValues(columns.size())
89
+ TImpl (const TVector<TString>& columns, const TVector<TString>& types, ui64 batchSize, TDuration batchCreationTimeout)
90
+ : BatchSize(batchSize)
91
+ , BatchCreationTimeout(batchCreationTimeout)
92
+ , ParsedValues(columns.size())
68
93
{
69
94
Y_ENSURE (columns.size () == types.size (), " Number of columns and types should by equal" );
95
+ LOG_ROW_DISPATCHER_INFO (" Simdjson active implementation " << simdjson::get_active_implementation ()->name ());
70
96
71
97
Columns.reserve (columns.size ());
72
98
for (size_t i = 0 ; i < columns.size (); i++) {
@@ -80,22 +106,51 @@ class TJsonParser::TImpl {
80
106
for (size_t i = 0 ; i < columns.size (); i++) {
81
107
ColumnsIndex.emplace (std::string_view (Columns[i].Name ), i);
82
108
}
109
+
110
+ Buffer.Reserve (BatchSize, 1 );
111
+ Parser.threaded = false ;
112
+ }
113
+
114
// Reports whether a batch can be parsed now: the buffer must be ready and either
// its accumulated size has reached BatchSize or at least BatchCreationTimeout has
// passed since Buffer.CreationStartTime.
+ bool IsReady () const {
115
+ return Buffer.IsReady () && (Buffer.GetSize () >= BatchSize || TInstant::Now () - Buffer.CreationStartTime >= BatchCreationTimeout);
116
+ }
117
+
118
// Returns Buffer.CreationStartTime + BatchCreationTimeout while the buffer is ready,
// and TInstant::Zero() otherwise.
+ TInstant GetCreationDeadline () const {
119
+ return Buffer.IsReady () ? Buffer.CreationStartTime + BatchCreationTimeout : TInstant::Zero ();
120
+ }
121
+
122
// Returns the number of values held by the buffer while it is ready, and 0 otherwise
// (that is, when the buffer is empty or has already been finished).
+ size_t GetNumberValues () const {
123
+ return Buffer.IsReady () ? Buffer.NumberValues : 0 ;
124
+ }
125
+
126
// Returns a read-only reference to the offsets stored in the buffer.
// NOTE(review): this member is not const-qualified although it only hands out a
// const reference; consider marking it const.
+ const TVector<ui64>& GetOffsets () {
127
+ return Buffer.Offsets ;
128
+ }
129
+
130
// Appends messages to the buffer. An empty vector is ignored and leaves the buffer
// untouched.
+ void AddMessages (const TVector<NYdb::NTopic::TReadSessionEvent::TDataReceivedEvent::TMessage>& messages) {
131
+ if (messages.empty ()) {
132
+ return ;
133
+ }
134
+
135
// A finished buffer still holds the previous batch; clear it before starting a new one.
+ if (Buffer.Finished ) {
136
+ Buffer.Clear ();
137
+ }
138
+ Buffer.AddMessages (messages);
83
139
}
84
140
85
141
const TVector<TVector<std::string_view>>& Parse () {
142
+ Y_ENSURE (Buffer.IsReady (), " Nothing to parse" );
143
+
86
144
const auto [values, size] = Buffer.Finish ();
87
145
LOG_ROW_DISPATCHER_TRACE (" Parse values:\n " << values);
88
146
89
147
for (auto & parsedColumn : ParsedValues) {
90
148
parsedColumn.clear ();
91
- parsedColumn.reserve (Buffer.GetNumberValues () );
149
+ parsedColumn.reserve (Buffer.NumberValues );
92
150
}
93
151
94
- simdjson::ondemand::parser parser;
95
- parser.threaded = false ;
96
-
97
152
size_t rowId = 0 ;
98
- simdjson::ondemand::document_stream documents = parser .iterate_many (values, size, simdjson::dom::DEFAULT_BATCH_SIZE);
153
+ simdjson::ondemand::document_stream documents = Parser .iterate_many (values, size, simdjson::dom::DEFAULT_BATCH_SIZE);
99
154
for (auto document : documents) {
100
155
for (auto item : document.get_object ()) {
101
156
const auto it = ColumnsIndex.find (item.escaped_key ().value ());
@@ -126,27 +181,20 @@ class TJsonParser::TImpl {
126
181
}
127
182
rowId++;
128
183
}
129
- Y_ENSURE (rowId == Buffer.GetNumberValues () , " Unexpected number of json documents" );
184
+ Y_ENSURE (rowId == Buffer.NumberValues , " Unexpected number of json documents" );
130
185
131
186
for (auto & parsedColumn : ParsedValues) {
132
- parsedColumn.resize (Buffer.GetNumberValues () );
187
+ parsedColumn.resize (Buffer.NumberValues );
133
188
}
134
189
return ParsedValues;
135
190
}
136
191
137
- TJsonParserBuffer& GetBuffer () {
138
- if (Buffer.GetFinished ()) {
139
- Buffer.Clear ();
140
- }
141
- return Buffer;
142
- }
143
-
144
192
// Builds a textual description of the parser: the name and type of every column,
// followed by the number of values in the buffer, the buffer size and the finished flag.
TString GetDescription () const {
145
193
TStringBuilder description = TStringBuilder () << " Columns: " ;
146
194
for (const auto & column : Columns) {
147
195
description << " '" << column.Name << " ':" << column.Type << " " ;
148
196
}
149
- description << " \n Buffer size: " << Buffer.GetNumberValues () << " , finished: " << Buffer.GetFinished () ;
197
+ description << " \n Number values in buffer: " << Buffer. NumberValues << " , buffer size: " << Buffer.GetSize () << " , finished: " << Buffer.Finished ;
150
198
return description;
151
199
}
152
200
@@ -182,22 +230,42 @@ class TJsonParser::TImpl {
182
230
}
183
231
184
232
private:
233
+ const ui64 BatchSize;
234
+ const TDuration BatchCreationTimeout;
185
235
TVector<TColumnDescription> Columns;
186
236
absl::flat_hash_map<std::string_view, size_t > ColumnsIndex;
187
237
188
238
TJsonParserBuffer Buffer;
239
+ simdjson::ondemand::parser Parser;
240
+
189
241
TVector<TVector<std::string_view>> ParsedValues;
190
242
};
191
243
192
- TJsonParser::TJsonParser (const TVector<TString>& columns, const TVector<TString>& types)
193
- : Impl(std::make_unique<TJsonParser::TImpl>(columns, types))
244
// Creates the implementation object and forwards columns, types, batchSize and
// batchCreationTimeout to it unchanged.
+ TJsonParser::TJsonParser (const TVector<TString>& columns, const TVector<TString>& types, ui64 batchSize, TDuration batchCreationTimeout )
245
+ : Impl(std::make_unique<TJsonParser::TImpl>(columns, types, batchSize, batchCreationTimeout ))
194
246
{}
195
247
196
248
// Destructor with an empty body.
// NOTE(review): presumably defined in this file so that TImpl is a complete type
// where Impl is destroyed - confirm against the header.
TJsonParser::~TJsonParser () {
197
249
}
198
250
199
- TJsonParserBuffer& TJsonParser::GetBuffer () {
200
- return Impl->GetBuffer ();
251
// Forwards messages to the implementation object.
+ void TJsonParser::AddMessages (const TVector<NYdb::NTopic::TReadSessionEvent::TDataReceivedEvent::TMessage>& messages) {
252
+ Impl->AddMessages (messages);
253
+ }
254
+
255
// Returns the readiness state reported by the implementation object.
+ bool TJsonParser::IsReady () const {
256
+ return Impl->IsReady ();
257
+ }
258
+
259
// Returns the batch creation deadline reported by the implementation object.
+ TInstant TJsonParser::GetCreationDeadline () const {
260
+ return Impl->GetCreationDeadline ();
261
+ }
262
+
263
// Returns the number of buffered values reported by the implementation object.
+ size_t TJsonParser::GetNumberValues () const {
264
+ return Impl->GetNumberValues ();
265
+ }
266
+
267
// Returns the reference to the message offsets obtained from the implementation object.
+ const TVector<ui64>& TJsonParser::GetOffsets () const {
268
+ return Impl->GetOffsets ();
201
269
}
202
270
203
271
const TVector<TVector<std::string_view>>& TJsonParser::Parse () {
@@ -212,8 +280,8 @@ TString TJsonParser::GetDebugString(const TVector<TVector<std::string_view>>& pa
212
280
return Impl->GetDebugString (parsedValues);
213
281
}
214
282
215
- std::unique_ptr<TJsonParser> NewJsonParser (const TVector<TString>& columns, const TVector<TString>& types) {
216
- return std::unique_ptr<TJsonParser>(new TJsonParser (columns, types));
283
// Factory: constructs a TJsonParser from the given columns, types, batchSize and
// batchCreationTimeout and returns it wrapped in std::unique_ptr.
// NOTE(review): uses a plain `new` instead of std::make_unique; possibly the
// constructor is not publicly accessible - confirm against the header.
+ std::unique_ptr<TJsonParser> NewJsonParser (const TVector<TString>& columns, const TVector<TString>& types, ui64 batchSize, TDuration batchCreationTimeout ) {
284
+ return std::unique_ptr<TJsonParser>(new TJsonParser (columns, types, batchSize, batchCreationTimeout ));
217
285
}
218
286
219
287
} // namespace NFq
0 commit comments