Skip to content

Commit f3e0968

Browse files
authored
generic lookup: use retry_policy library (#13460)
1 parent 9d32e6d commit f3e0968

File tree

2 files changed

+45
-38
lines changed

2 files changed

+45
-38
lines changed

ydb/library/yql/providers/generic/actors/yql_generic_base_actor.h

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -90,15 +90,6 @@ namespace NYql::NDq {
9090
NConnector::NApi::TError Error;
9191
};
9292

93-
struct TEvRetry: NActors::TEventLocal<TEvRetry, EvRetry> {
94-
explicit TEvRetry(ui32 nextRetries)
95-
: NextRetries(nextRetries)
96-
{
97-
}
98-
99-
ui32 NextRetries;
100-
};
101-
10293
protected: // TODO move common logic here
10394
};
10495

ydb/library/yql/providers/generic/actors/yql_generic_lookup_actor.cpp

Lines changed: 45 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
#include <yql/essentials/utils/yql_panic.h>
2323
#include <ydb/core/formats/arrow/serializer/abstract.h>
2424

25+
#include <library/cpp/retry/retry_policy.h>
26+
2527
namespace NYql::NDq {
2628

2729
using namespace NActors;
@@ -60,6 +62,12 @@ namespace NYql::NDq {
6062
public TGenericBaseActor<TGenericLookupActor> {
6163
using TBase = TGenericBaseActor<TGenericLookupActor>;
6264

65+
// Retry policy whose decisions are driven by the gRPC status of a failed
// Connector call; the concrete policy instance is built in the constructor's
// initializer list via GetExponentialBackoffPolicy.
using ILookupRetryPolicy = IRetryPolicy<const NYdbGrpc::TGrpcStatus&>;
66+
// State object obtained from the policy (RetryPolicy->CreateRetryState()) and
// asked, via GetNextRetryDelay, for the delay before the next attempt.
using ILookupRetryState = ILookupRetryPolicy::IRetryState;
67+
68+
// Payload-free local event registered under the EvRetry id. The actor
// schedules it to itself after a delay; its handler calls SendRequest().
struct TEvLookupRetry : NActors::TEventLocal<TEvLookupRetry, EvRetry> {
69+
};
70+
6371
public:
6472
TGenericLookupActor(
6573
NConnector::IClient::TPtr connectorClient,
@@ -86,6 +94,24 @@ namespace NYql::NDq {
8694
, HolderFactory(holderFactory)
8795
, ColumnDestinations(CreateColumnDestination())
8896
, MaxKeysInRequest(maxKeysInRequest)
97+
, RetryPolicy(
98+
ILookupRetryPolicy::GetExponentialBackoffPolicy(
99+
/* retryClassFunction */
100+
[](const NYdbGrpc::TGrpcStatus& status) {
101+
if (NConnector::GrpcStatusNeedsRetry(status)) {
102+
return ERetryErrorClass::ShortRetry;
103+
}
104+
if (status.GRpcStatusCode == grpc::DEADLINE_EXCEEDED) {
105+
return ERetryErrorClass::ShortRetry; // TODO LongRetry?
106+
}
107+
return ERetryErrorClass::NoRetry;
108+
},
109+
/* minDelay */ TDuration::MilliSeconds(1),
110+
/* minLongRetryDelay */ TDuration::MilliSeconds(500),
111+
/* maxDelay */ TDuration::Seconds(1),
112+
/* maxRetries */ RequestRetriesLimit,
113+
/* maxTime */ TDuration::Minutes(5),
114+
/* scaleFactor */ 2))
89115
{
90116
InitMonCounters(taskCounters);
91117
}
@@ -156,7 +182,7 @@ namespace NYql::NDq {
156182
hFunc(TEvReadSplitsPart, Handle);
157183
hFunc(TEvReadSplitsFinished, Handle);
158184
hFunc(TEvError, Handle);
159-
hFunc(TEvRetry, Handle);
185+
hFunc(TEvLookupRetry, Handle);
160186
hFunc(NActors::TEvents::TEvPoison, Handle);)
161187

162188
void Handle(TEvListSplitsIterator::TPtr ev) {
@@ -165,7 +191,7 @@ namespace NYql::NDq {
165191
[
166192
actorSystem = TActivationContext::ActorSystem(),
167193
selfId = SelfId(),
168-
retriesRemaining = RetriesRemaining
194+
retryState = RetryState
169195
](const NConnector::TAsyncResult<NConnector::NApi::TListSplitsResponse>& asyncResult) {
170196
YQL_CLOG(DEBUG, ProviderGeneric) << "ActorId=" << selfId << " Got TListSplitsResponse from Connector";
171197
auto result = ExtractFromConstFuture(asyncResult);
@@ -174,7 +200,7 @@ namespace NYql::NDq {
174200
auto ev = new TEvListSplitsPart(std::move(*result.Response));
175201
actorSystem->Send(new NActors::IEventHandle(selfId, selfId, ev));
176202
} else {
177-
SendRetryOrError(actorSystem, selfId, result.Status, retriesRemaining);
203+
SendRetryOrError(actorSystem, selfId, result.Status, retryState);
178204
}
179205
});
180206
}
@@ -198,15 +224,15 @@ namespace NYql::NDq {
198224
Connector->ReadSplits(readRequest, RequestTimeout).Subscribe([
199225
actorSystem = TActivationContext::ActorSystem(),
200226
selfId = SelfId(),
201-
retriesRemaining = RetriesRemaining
227+
retryState = RetryState
202228
](const NConnector::TReadSplitsStreamIteratorAsyncResult& asyncResult) {
203229
YQL_CLOG(DEBUG, ProviderGeneric) << "ActorId=" << selfId << " Got ReadSplitsStreamIterator from Connector";
204230
auto result = ExtractFromConstFuture(asyncResult);
205231
if (result.Status.Ok()) {
206232
auto ev = new TEvReadSplitsIterator(std::move(result.Iterator));
207233
actorSystem->Send(new NActors::IEventHandle(selfId, selfId, ev));
208234
} else {
209-
SendRetryOrError(actorSystem, selfId, result.Status, retriesRemaining);
235+
SendRetryOrError(actorSystem, selfId, result.Status, retryState);
210236
}
211237
});
212238
}
@@ -235,9 +261,8 @@ namespace NYql::NDq {
235261
actorSystem->Send(new NActors::IEventHandle(ParentId, SelfId(), errEv.release()));
236262
}
237263

238-
void Handle(TEvRetry::TPtr ev) {
264+
// Handler for the delayed retry event: holds a guard on *Alloc for the
// duration of the call and issues the request again through SendRequest().
// The event has no payload, so the parameter is intentionally unnamed.
void Handle(TEvLookupRetry::TPtr) {
239265
auto guard = Guard(*Alloc);
240-
RetriesRemaining = ev->Get()->NextRetries;
241266
SendRequest();
242267
}
243268

@@ -269,7 +294,7 @@ namespace NYql::NDq {
269294
}
270295

271296
Request = std::move(request);
272-
RetriesRemaining = RequestRetriesLimit;
297+
RetryState = std::shared_ptr<ILookupRetryState>(RetryPolicy->CreateRetryState());
273298
SendRequest();
274299
}
275300

@@ -287,7 +312,7 @@ namespace NYql::NDq {
287312
Connector->ListSplits(splitRequest, RequestTimeout).Subscribe([
288313
actorSystem = TActivationContext::ActorSystem(),
289314
selfId = SelfId(),
290-
retriesRemaining = RetriesRemaining
315+
retryState = RetryState
291316
](const NConnector::TListSplitsStreamIteratorAsyncResult& asyncResult) {
292317
auto result = ExtractFromConstFuture(asyncResult);
293318
if (result.Status.Ok()) {
@@ -296,7 +321,7 @@ namespace NYql::NDq {
296321
auto ev = new TEvListSplitsIterator(std::move(result.Iterator));
297322
actorSystem->Send(new NActors::IEventHandle(selfId, selfId, ev));
298323
} else {
299-
SendRetryOrError(actorSystem, selfId, result.Status, retriesRemaining);
324+
SendRetryOrError(actorSystem, selfId, result.Status, retryState);
300325
}
301326
});
302327
if (CpuTime) {
@@ -309,7 +334,7 @@ namespace NYql::NDq {
309334
[
310335
actorSystem = TActivationContext::ActorSystem(),
311336
selfId = SelfId(),
312-
retriesRemaining = RetriesRemaining
337+
retryState = RetryState
313338
](const NConnector::TAsyncResult<NConnector::NApi::TReadSplitsResponse>& asyncResult) {
314339
auto result = ExtractFromConstFuture(asyncResult);
315340
if (result.Status.Ok()) {
@@ -328,7 +353,7 @@ namespace NYql::NDq {
328353
auto ev = new TEvReadSplitsFinished(std::move(result.Status));
329354
actorSystem->Send(new NActors::IEventHandle(selfId, selfId, ev));
330355
} else {
331-
SendRetryOrError(actorSystem, selfId, result.Status, retriesRemaining);
356+
SendRetryOrError(actorSystem, selfId, result.Status, retryState);
332357
}
333358
});
334359
}
@@ -394,22 +419,12 @@ namespace NYql::NDq {
394419
new TEvError(std::move(error)));
395420
}
396421

397-
static void SendRetryOrError(NActors::TActorSystem* actorSystem, const NActors::TActorId& selfId, const NYdbGrpc::TGrpcStatus& status, ui32 retriesRemaining) {
398-
if (NConnector::GrpcStatusNeedsRetry(status) || status.GRpcStatusCode == grpc::DEADLINE_EXCEEDED) {
399-
if (retriesRemaining) {
400-
const auto retry = RequestRetriesLimit - retriesRemaining;
401-
const auto delay = TDuration::MilliSeconds(1u << retry); // Exponential delay from 1ms to ~0.5s
402-
// << TODO tune/tweak
403-
YQL_CLOG(WARN, ProviderGeneric) << "ActorId=" << selfId << " Got retrievable GRPC Error from Connector: " << status.ToDebugString() << ", retry " << (retry + 1) << " of " << RequestRetriesLimit << ", scheduled in " << delay;
404-
--retriesRemaining;
405-
if (status.GRpcStatusCode == grpc::DEADLINE_EXCEEDED) {
406-
// if error was deadline, retry only once
407-
retriesRemaining = 0; // TODO tune/tweak
408-
}
409-
actorSystem->Schedule(delay, new IEventHandle(selfId, selfId, new TEvRetry(retriesRemaining)));
410-
return;
411-
}
412-
YQL_CLOG(ERROR, ProviderGeneric) << "ActorId=" << selfId << " Got retrievable GRPC Error from Connector: " << status.ToDebugString() << ", retry count exceed limit " << RequestRetriesLimit;
422+
// Decides what to do with a failed Connector call: either schedules a delayed
// TEvLookupRetry to the lookup actor, or reports the failure via SendError.
//
// actorSystem - actor system used to schedule the retry event
// selfId      - id of the lookup actor; used as both sender and recipient
// status      - gRPC status of the failed call
// retryState  - retry state of the current request; GetNextRetryDelay(status)
//               returns the delay before the next attempt, or an empty value
//               when no further attempt should be made
//
// Static on purpose: the Subscribe() callbacks that call it capture
// actorSystem / selfId / retryState by value instead of touching `this`.
// The shared_ptr is taken by const reference because it is only used for the
// duration of the call and never stored.
static void SendRetryOrError(NActors::TActorSystem* actorSystem, const NActors::TActorId& selfId, const NYdbGrpc::TGrpcStatus& status, const std::shared_ptr<ILookupRetryState>& retryState) {
    if (const auto nextRetry = retryState->GetNextRetryDelay(status)) {
        YQL_CLOG(WARN, ProviderGeneric) << "ActorId=" << selfId << " Got retrievable GRPC Error from Connector: " << status.ToDebugString() << ", retry scheduled in " << *nextRetry;
        actorSystem->Schedule(*nextRetry, new IEventHandle(selfId, selfId, new TEvLookupRetry()));
        return;
    }
    // The policy refused another attempt. If the status is one of the retriable
    // kinds, the refusal means the retry budget is exhausted; log that
    // explicitly so the give-up is visible, not only the final error.
    if (NConnector::GrpcStatusNeedsRetry(status) || status.GRpcStatusCode == grpc::DEADLINE_EXCEEDED) {
        YQL_CLOG(ERROR, ProviderGeneric) << "ActorId=" << selfId << " Got retrievable GRPC Error from Connector: " << status.ToDebugString() << ", retry limit exceeded";
    }
    SendError(actorSystem, selfId, NConnector::ErrorFromGRPCStatus(status));
}
@@ -509,7 +524,8 @@ namespace NYql::NDq {
509524
std::shared_ptr<IDqAsyncLookupSource::TUnboxedValueMap> Request;
510525
NConnector::IReadSplitsStreamIterator::TPtr ReadSplitsIterator; // TODO move me to TEvReadSplitsPart
511526
NKikimr::NMiniKQL::TKeyPayloadPairVector LookupResult;
512-
ui32 RetriesRemaining;
527+
ILookupRetryPolicy::TPtr RetryPolicy;
528+
std::shared_ptr<ILookupRetryState> RetryState;
513529
::NMonitoring::TDynamicCounters::TCounterPtr Count;
514530
::NMonitoring::TDynamicCounters::TCounterPtr Keys;
515531
::NMonitoring::TDynamicCounters::TCounterPtr ResultRows;

0 commit comments

Comments
 (0)