Skip to content

Commit ac1ae9a

Browse files
authored
generic lookup: use retry_policy library (backport #13460) (#14956)
1 parent 95333f6 commit ac1ae9a

File tree

2 files changed

+45
-38
lines changed

2 files changed

+45
-38
lines changed

ydb/library/yql/providers/generic/actors/yql_generic_base_actor.h

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -90,15 +90,6 @@ namespace NYql::NDq {
9090
NConnector::NApi::TError Error;
9191
};
9292

93-
// Actor-local retry event that carries the retry budget left for the request.
// Every line of this struct is marked as deleted in this diff; the lookup actor
// code that constructed it and read NextRetries is likewise on removed lines.
struct TEvRetry: NActors::TEventLocal<TEvRetry, EvRetry> {
94-
explicit TEvRetry(ui32 nextRetries)
95-
: NextRetries(nextRetries)
96-
{
97-
}
98-
99-
// Number of retries still allowed once this event has been handled.
ui32 NextRetries;
100-
};
101-
10293
protected: // TODO move common logic here
10394
};
10495

ydb/library/yql/providers/generic/actors/yql_generic_lookup_actor.cpp

Lines changed: 45 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
#include <yql/essentials/utils/yql_panic.h>
2424
#include <ydb/core/formats/arrow/serializer/abstract.h>
2525

26+
#include <library/cpp/retry/retry_policy.h>
27+
2628
namespace NYql::NDq {
2729

2830
using namespace NActors;
@@ -61,6 +63,12 @@ namespace NYql::NDq {
6163
public TGenericBaseActor<TGenericLookupActor> {
6264
using TBase = TGenericBaseActor<TGenericLookupActor>;
6365

66+
// Retry policy specialised on the gRPC status of a failed Connector call:
// the status is what the retry-class function and GetNextRetryDelay receive.
using ILookupRetryPolicy = IRetryPolicy<const NYdbGrpc::TGrpcStatus&>;
67+
// Per-request retry state produced by the policy's CreateRetryState().
using ILookupRetryState = ILookupRetryPolicy::IRetryState;
68+
69+
// Payload-free local event scheduled by SendRetryOrError once a retry delay
// has been chosen; its handler simply re-issues the request via SendRequest().
// Reuses the EvRetry event id.
struct TEvLookupRetry : NActors::TEventLocal<TEvLookupRetry, EvRetry> {
70+
};
71+
6472
public:
6573
TGenericLookupActor(
6674
NConnector::IClient::TPtr connectorClient,
@@ -87,6 +95,24 @@ namespace NYql::NDq {
8795
, HolderFactory(holderFactory)
8896
, ColumnDestinations(CreateColumnDestination())
8997
, MaxKeysInRequest(maxKeysInRequest)
98+
, RetryPolicy(
99+
ILookupRetryPolicy::GetExponentialBackoffPolicy(
100+
/* retryClassFunction */
101+
[](const NYdbGrpc::TGrpcStatus& status) {
102+
if (NConnector::GrpcStatusNeedsRetry(status)) {
103+
return ERetryErrorClass::ShortRetry;
104+
}
105+
if (status.GRpcStatusCode == grpc::DEADLINE_EXCEEDED) {
106+
return ERetryErrorClass::ShortRetry; // TODO LongRetry?
107+
}
108+
return ERetryErrorClass::NoRetry;
109+
},
110+
/* minDelay */ TDuration::MilliSeconds(1),
111+
/* minLongRetryDelay */ TDuration::MilliSeconds(500),
112+
/* maxDelay */ TDuration::Seconds(1),
113+
/* maxRetries */ RequestRetriesLimit,
114+
/* maxTime */ TDuration::Minutes(5),
115+
/* scaleFactor */ 2))
90116
{
91117
InitMonCounters(taskCounters);
92118
}
@@ -157,7 +183,7 @@ namespace NYql::NDq {
157183
hFunc(TEvReadSplitsPart, Handle);
158184
hFunc(TEvReadSplitsFinished, Handle);
159185
hFunc(TEvError, Handle);
160-
hFunc(TEvRetry, Handle);
186+
hFunc(TEvLookupRetry, Handle);
161187
hFunc(NActors::TEvents::TEvPoison, Handle);)
162188

163189
void Handle(TEvListSplitsIterator::TPtr ev) {
@@ -166,7 +192,7 @@ namespace NYql::NDq {
166192
[
167193
actorSystem = TActivationContext::ActorSystem(),
168194
selfId = SelfId(),
169-
retriesRemaining = RetriesRemaining
195+
retryState = RetryState
170196
](const NConnector::TAsyncResult<NConnector::NApi::TListSplitsResponse>& asyncResult) {
171197
YQL_CLOG(DEBUG, ProviderGeneric) << "ActorId=" << selfId << " Got TListSplitsResponse from Connector";
172198
auto result = ExtractFromConstFuture(asyncResult);
@@ -175,7 +201,7 @@ namespace NYql::NDq {
175201
auto ev = new TEvListSplitsPart(std::move(*result.Response));
176202
actorSystem->Send(new NActors::IEventHandle(selfId, selfId, ev));
177203
} else {
178-
SendRetryOrError(actorSystem, selfId, result.Status, retriesRemaining);
204+
SendRetryOrError(actorSystem, selfId, result.Status, retryState);
179205
}
180206
});
181207
}
@@ -199,15 +225,15 @@ namespace NYql::NDq {
199225
Connector->ReadSplits(readRequest, RequestTimeout).Subscribe([
200226
actorSystem = TActivationContext::ActorSystem(),
201227
selfId = SelfId(),
202-
retriesRemaining = RetriesRemaining
228+
retryState = RetryState
203229
](const NConnector::TReadSplitsStreamIteratorAsyncResult& asyncResult) {
204230
YQL_CLOG(DEBUG, ProviderGeneric) << "ActorId=" << selfId << " Got ReadSplitsStreamIterator from Connector";
205231
auto result = ExtractFromConstFuture(asyncResult);
206232
if (result.Status.Ok()) {
207233
auto ev = new TEvReadSplitsIterator(std::move(result.Iterator));
208234
actorSystem->Send(new NActors::IEventHandle(selfId, selfId, ev));
209235
} else {
210-
SendRetryOrError(actorSystem, selfId, result.Status, retriesRemaining);
236+
SendRetryOrError(actorSystem, selfId, result.Status, retryState);
211237
}
212238
});
213239
}
@@ -236,9 +262,8 @@ namespace NYql::NDq {
236262
actorSystem->Send(new NActors::IEventHandle(ParentId, SelfId(), errEv.release()));
237263
}
238264

239-
void Handle(TEvRetry::TPtr ev) {
265+
// Retry timer fired: send the request again.
// The event is unnamed because it has no payload; the retry bookkeeping is
// kept in RetryState rather than being passed through the event.
void Handle(TEvLookupRetry::TPtr) {
240266
// Hold the allocator guard for the duration of SendRequest().
// NOTE(review): presumably SendRequest touches Alloc-managed values — confirm.
auto guard = Guard(*Alloc);
241-
RetriesRemaining = ev->Get()->NextRetries;
242267
SendRequest();
243268
}
244269

@@ -270,7 +295,7 @@ namespace NYql::NDq {
270295
}
271296

272297
Request = std::move(request);
273-
RetriesRemaining = RequestRetriesLimit;
298+
RetryState = std::shared_ptr<ILookupRetryState>(RetryPolicy->CreateRetryState());
274299
SendRequest();
275300
}
276301

@@ -288,7 +313,7 @@ namespace NYql::NDq {
288313
Connector->ListSplits(splitRequest, RequestTimeout).Subscribe([
289314
actorSystem = TActivationContext::ActorSystem(),
290315
selfId = SelfId(),
291-
retriesRemaining = RetriesRemaining
316+
retryState = RetryState
292317
](const NConnector::TListSplitsStreamIteratorAsyncResult& asyncResult) {
293318
auto result = ExtractFromConstFuture(asyncResult);
294319
if (result.Status.Ok()) {
@@ -297,7 +322,7 @@ namespace NYql::NDq {
297322
auto ev = new TEvListSplitsIterator(std::move(result.Iterator));
298323
actorSystem->Send(new NActors::IEventHandle(selfId, selfId, ev));
299324
} else {
300-
SendRetryOrError(actorSystem, selfId, result.Status, retriesRemaining);
325+
SendRetryOrError(actorSystem, selfId, result.Status, retryState);
301326
}
302327
});
303328
if (CpuTime) {
@@ -310,7 +335,7 @@ namespace NYql::NDq {
310335
[
311336
actorSystem = TActivationContext::ActorSystem(),
312337
selfId = SelfId(),
313-
retriesRemaining = RetriesRemaining
338+
retryState = RetryState
314339
](const NConnector::TAsyncResult<NConnector::NApi::TReadSplitsResponse>& asyncResult) {
315340
auto result = ExtractFromConstFuture(asyncResult);
316341
if (result.Status.Ok()) {
@@ -329,7 +354,7 @@ namespace NYql::NDq {
329354
auto ev = new TEvReadSplitsFinished(std::move(result.Status));
330355
actorSystem->Send(new NActors::IEventHandle(selfId, selfId, ev));
331356
} else {
332-
SendRetryOrError(actorSystem, selfId, result.Status, retriesRemaining);
357+
SendRetryOrError(actorSystem, selfId, result.Status, retryState);
333358
}
334359
});
335360
}
@@ -395,22 +420,12 @@ namespace NYql::NDq {
395420
new TEvError(std::move(error)));
396421
}
397422

398-
static void SendRetryOrError(NActors::TActorSystem* actorSystem, const NActors::TActorId& selfId, const NYdbGrpc::TGrpcStatus& status, ui32 retriesRemaining) {
399-
if (NConnector::GrpcStatusNeedsRetry(status) || status.GRpcStatusCode == grpc::DEADLINE_EXCEEDED) {
400-
if (retriesRemaining) {
401-
const auto retry = RequestRetriesLimit - retriesRemaining;
402-
const auto delay = TDuration::MilliSeconds(1u << retry); // Exponential delay from 1ms to ~0.5s
403-
// << TODO tune/tweak
404-
YQL_CLOG(WARN, ProviderGeneric) << "ActorId=" << selfId << " Got retrievable GRPC Error from Connector: " << status.ToDebugString() << ", retry " << (retry + 1) << " of " << RequestRetriesLimit << ", scheduled in " << delay;
405-
--retriesRemaining;
406-
if (status.GRpcStatusCode == grpc::DEADLINE_EXCEEDED) {
407-
// if error was deadline, retry only once
408-
retriesRemaining = 0; // TODO tune/tweak
409-
}
410-
actorSystem->Schedule(delay, new IEventHandle(selfId, selfId, new TEvRetry(retriesRemaining)));
411-
return;
412-
}
413-
YQL_CLOG(ERROR, ProviderGeneric) << "ActorId=" << selfId << " Got retrievable GRPC Error from Connector: " << status.ToDebugString() << ", retry count exceed limit " << RequestRetriesLimit;
423+
// Decides what to do with a failed Connector call.
//
// Asks `retryState` for the next retry delay for `status`. If a delay is
// returned, logs a warning and schedules a TEvLookupRetry to `selfId` after
// that delay. If no delay is returned, converts `status` into a Connector
// error and reports it through SendError.
//
// Static and driven only by its arguments: the callers are future-subscription
// lambdas that capture actorSystem, selfId and retryState by value.
//
// NOTE(review): the replaced implementation allowed only a single retry after
// DEADLINE_EXCEEDED; the policy built in the constructor now classifies it as
// an ordinary ShortRetry — confirm this change is intended.
// NOTE(review): GetNextRetryDelay is invoked from the subscription callback;
// presumably only one request per RetryState is in flight at a time — verify.
static void SendRetryOrError(NActors::TActorSystem* actorSystem, const NActors::TActorId& selfId, const NYdbGrpc::TGrpcStatus& status, std::shared_ptr<ILookupRetryState> retryState) {
424+
// Empty result means "do not retry" (tested and dereferenced below).
auto nextRetry = retryState->GetNextRetryDelay(status);
425+
if (nextRetry) {
426+
// NOTE(review): "retrievable" in the message probably means "retriable";
// left untouched because it is runtime log text.
YQL_CLOG(WARN, ProviderGeneric) << "ActorId=" << selfId << " Got retrievable GRPC Error from Connector: " << status.ToDebugString() << ", retry scheduled in " << *nextRetry;
427+
actorSystem->Schedule(*nextRetry, new IEventHandle(selfId, selfId, new TEvLookupRetry()));
428+
return;
414429
}
415430
// No retry granted: surface the failure as an error.
SendError(actorSystem, selfId, NConnector::ErrorFromGRPCStatus(status));
416431
}
@@ -510,7 +525,8 @@ namespace NYql::NDq {
510525
std::shared_ptr<IDqAsyncLookupSource::TUnboxedValueMap> Request;
511526
NConnector::IReadSplitsStreamIterator::TPtr ReadSplitsIterator; // TODO move me to TEvReadSplitsPart
512527
NKikimr::NMiniKQL::TKeyPayloadPairVector LookupResult;
513-
ui32 RetriesRemaining;
528+
// Exponential-backoff policy, built once in the constructor.
ILookupRetryPolicy::TPtr RetryPolicy;
529+
// Retry state of the current request: recreated from RetryPolicy each time a
// new request is stored, and copied into the Connector callbacks.
std::shared_ptr<ILookupRetryState> RetryState;
514530
::NMonitoring::TDynamicCounters::TCounterPtr Count;
515531
::NMonitoring::TDynamicCounters::TCounterPtr Keys;
516532
::NMonitoring::TDynamicCounters::TCounterPtr ResultRows;

0 commit comments

Comments
 (0)