Skip to content

Commit 4a5e2ba

Browse files
authored
Wilson parallel upload (#1875)
1 parent c2fd5b9 commit 4a5e2ba

File tree

6 files changed

+61
-26
lines changed

6 files changed

+61
-26
lines changed

ydb/core/driver_lib/run/kikimr_services_initializers.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -870,6 +870,7 @@ void TBasicServicesInitializer::InitializeServices(NActors::TActorSystemSetup* s
870870
GET_FIELD_FROM_CONFIG(MaxSpansInBatch)
871871
GET_FIELD_FROM_CONFIG(MaxBytesInBatch)
872872
GET_FIELD_FROM_CONFIG(SpanExportTimeoutSeconds)
873+
GET_FIELD_FROM_CONFIG(MaxExportRequestsInflight)
873874

874875
#undef GET_FIELD_FROM_CONFIG
875876

ydb/core/protos/config.proto

+2
Original file line numberDiff line numberDiff line change
@@ -1577,6 +1577,8 @@ message TTracingConfig {
15771577
// time after which generated span will be discarded and will
15781578
// not be sent to the collector
15791579
optional uint32 SpanExportTimeoutSeconds = 5;
1580+
// maximum batch export requests being run simultaneously
1581+
optional uint64 MaxExportRequestsInflight = 6;
15801582
}
15811583

15821584
reserved 1 to 5;

ydb/library/actors/wilson/wilson_uploader.cpp

+49-24
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,13 @@ namespace NWilson {
8484
}
8585
};
8686

87+
struct TExportRequestData : TIntrusiveListItem<TExportRequestData> {
88+
std::unique_ptr<grpc::ClientContext> Context;
89+
std::unique_ptr<grpc::ClientAsyncResponseReader<NServiceProto::ExportTraceServiceResponse>> Reader;
90+
grpc::Status Status;
91+
NServiceProto::ExportTraceServiceResponse Response;
92+
};
93+
8794
class TWilsonUploader
8895
: public TActorBootstrapped<TWilsonUploader>
8996
{
@@ -95,6 +102,7 @@ namespace NWilson {
95102
ui64 MaxBytesInBatch;
96103
TDuration MaxBatchAccumulation = TDuration::Seconds(1);
97104
TDuration MaxSpanTimeInQueue;
105+
ui64 MaxExportInflight;
98106

99107
bool WakeupScheduled = false;
100108

@@ -106,10 +114,6 @@ namespace NWilson {
106114
grpc::CompletionQueue CQ;
107115

108116
std::unique_ptr<IGrpcSigner> GrpcSigner;
109-
std::unique_ptr<grpc::ClientContext> Context;
110-
std::unique_ptr<grpc::ClientAsyncResponseReader<NServiceProto::ExportTraceServiceResponse>> Reader;
111-
NServiceProto::ExportTraceServiceResponse Response;
112-
grpc::Status Status;
113117

114118
TBatch CurrentBatch;
115119
std::queue<TBatch::TData> BatchQueue;
@@ -119,13 +123,17 @@ namespace NWilson {
119123
bool BatchCompletionScheduled = false;
120124
TMonotonic NextBatchCompletion;
121125

126+
TIntrusiveListWithAutoDelete<TExportRequestData, TDelete> ExportRequests;
127+
size_t ExportRequestsCount = 0;
128+
122129
public:
123130
TWilsonUploader(WilsonUploaderParams params)
124131
: MaxSpansPerSecond(params.MaxSpansPerSecond)
125132
, MaxSpansInBatch(params.MaxSpansInBatch)
126133
, MaxBytesInBatch(params.MaxBytesInBatch)
127134
, MaxBatchAccumulation(params.MaxBatchAccumulation)
128135
, MaxSpanTimeInQueue(TDuration::Seconds(params.SpanExportTimeoutSeconds))
136+
, MaxExportInflight(params.MaxExportRequestsInflight)
129137
, CollectorUrl(std::move(params.CollectorUrl))
130138
, ServiceName(std::move(params.ServiceName))
131139
, GrpcSigner(std::move(params.GrpcSigner))
@@ -149,6 +157,10 @@ namespace NWilson {
149157
ALOG_WARN(WILSON_SERVICE_ID, "max_spans_in_batch shold be greater than 0, changing to 1");
150158
MaxSpansInBatch = 1;
151159
}
160+
if (MaxExportInflight == 0) {
161+
ALOG_WARN(WILSON_SERVICE_ID, "max_span_export_inflight should be greater than 0, changing to 1");
162+
MaxExportInflight = 1;
163+
}
152164

153165
TStringBuf scheme;
154166
TStringBuf host;
@@ -243,7 +255,7 @@ namespace NWilson {
243255
"dropped " << numSpansDropped << " span(s) due to expiration");
244256
}
245257

246-
if (Context || BatchQueue.empty()) {
258+
if (ExportRequestsCount >= MaxExportInflight || BatchQueue.empty()) {
247259
return;
248260
} else if (now < NextSendTimestamp) {
249261
ScheduleWakeup(NextSendTimestamp);
@@ -267,29 +279,42 @@ namespace NWilson {
267279
SpansSizeBytes -= batch.SizeBytes;
268280

269281
ScheduleWakeup(NextSendTimestamp);
270-
Context = std::make_unique<grpc::ClientContext>();
282+
283+
auto context = std::make_unique<grpc::ClientContext>();
271284
if (GrpcSigner) {
272-
GrpcSigner->SignClientContext(*Context);
285+
GrpcSigner->SignClientContext(*context);
273286
}
274-
Reader = Stub->AsyncExport(Context.get(), std::move(batch.Request), &CQ);
275-
Reader->Finish(&Response, &Status, nullptr);
287+
auto reader = Stub->AsyncExport(context.get(), std::move(batch.Request), &CQ);
288+
auto uploadData = std::unique_ptr<TExportRequestData>(new TExportRequestData {
289+
.Context = std::move(context),
290+
.Reader = std::move(reader),
291+
});
292+
uploadData->Reader->Finish(&uploadData->Response, &uploadData->Status, uploadData.get());
293+
ALOG_TRACE(WILSON_SERVICE_ID, "started export request " << (void*)uploadData.get());
294+
ExportRequests.PushBack(uploadData.release());
295+
++ExportRequestsCount;
276296
}
277297

278-
void CheckIfDone() {
279-
if (Context) {
280-
void *tag;
281-
bool ok;
282-
if (CQ.AsyncNext(&tag, &ok, std::chrono::system_clock::now()) == grpc::CompletionQueue::GOT_EVENT) {
283-
if (!Status.ok()) {
284-
ALOG_ERROR(WILSON_SERVICE_ID,
285-
"failed to commit traces: " << Status.error_message());
286-
}
287-
288-
Reader.reset();
289-
Context.reset();
290-
} else {
291-
ScheduleWakeup(TDuration::MilliSeconds(100));
298+
void ReapCompletedRequests() {
299+
if (ExportRequests.Empty()) {
300+
return;
301+
}
302+
void* tag;
303+
bool ok;
304+
while (CQ.AsyncNext(&tag, &ok, std::chrono::system_clock::now()) == grpc::CompletionQueue::GOT_EVENT) {
305+
auto node = std::unique_ptr<TExportRequestData>(static_cast<TExportRequestData*>(tag));
306+
ALOG_TRACE(WILSON_SERVICE_ID, "finished export request " << (void*)node.get());
307+
if (!node->Status.ok()) {
308+
ALOG_ERROR(WILSON_SERVICE_ID,
309+
"failed to commit traces: " << node->Status.error_message());
292310
}
311+
312+
--ExportRequestsCount;
313+
node->Unlink();
314+
}
315+
316+
if (!ExportRequests.Empty()) {
317+
ScheduleWakeup(TDuration::MilliSeconds(100));
293318
}
294319
}
295320

@@ -322,7 +347,7 @@ namespace NWilson {
322347
}
323348

324349
void TryMakeProgress() {
325-
CheckIfDone();
350+
ReapCompletedRequests();
326351
TryToSend();
327352
}
328353

ydb/library/actors/wilson/wilson_uploader.h

+1
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ namespace NWilson {
3636
ui64 MaxBatchAccumulationMilliseconds = 1'000;
3737
TDuration MaxBatchAccumulation = TDuration::Seconds(1);
3838
ui32 SpanExportTimeoutSeconds = 60 * 60 * 24 * 365;
39+
ui64 MaxExportRequestsInflight = 1;
3940

4041
NActors::IActor* CreateUploader() &&;
4142
};

ydb/tools/cfg/static.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -1208,7 +1208,11 @@ def get_uploader(uploader):
12081208

12091209
span_export_timeout_seconds = uploader.get("span_export_timeout_seconds")
12101210
if span_export_timeout_seconds is not None:
1211-
uploader_pb.SpanTtlSeconds = span_export_timeout_seconds
1211+
uploader_pb.SpanExportTimeoutSeconds = span_export_timeout_seconds
1212+
1213+
max_export_requests_inflight = uploader.get("max_export_requests_inflight")
1214+
if max_export_requests_inflight is not None:
1215+
uploader_pb.MaxExportRequestsInflight = max_export_requests_inflight
12121216

12131217
return uploader_pb
12141218

ydb/tools/cfg/validation.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -176,9 +176,11 @@
176176
max_spans_per_second=dict(type="integer", minimum=1),
177177
max_spans_in_batch=dict(type="integer", minimum=1),
178178
max_bytes_in_batch=dict(type="integer"),
179-
max_bytes_accumulation_milliseconds=dict(type="integer"),
179+
max_batch_accumulation_milliseconds=dict(type="integer"),
180180
span_export_timeout_seconds=dict(type="integer", minimum=1),
181+
max_export_requests_inflight=dict(type="integer", minimum=1),
181182
),
183+
additionalProperties=False,
182184
),
183185
sampling=dict(
184186
type="array",

0 commit comments

Comments
 (0)