@@ -118,13 +118,15 @@ class TKqpStreamLookupActor : public NActors::TActorBootstrapped<TKqpStreamLooku
118
118
// Lifecycle state of a single shard read.
enum class EReadState {
    Initial,   // NOTE(review): presumably the state before the read is started — confirm against the constructor
    Running,   // The only state in which incoming read results are processed
    Blocked,   // Read can't accept new data, but not finished yet (set before a retry is scheduled)
    Finished,  // Read is complete; late results for it are dropped
};

// Returns a human-readable name for `state`, intended for logging.
//
// NOTE(review): the leading space inside each literal is reproduced exactly as
// it appears in the reviewed text; it may be an extraction artifact — confirm
// against the upstream file before relying on the exact bytes.
std::string_view ReadStateToString(EReadState state) {
    // Bring operator""sv into scope locally so the function does not depend on
    // a file-level using-directive.
    using namespace std::string_view_literals;

    // Deliberately no `default:` label, so the compiler can still warn when a
    // new enumerator is added but not handled here.
    switch (state) {
        case EReadState::Initial:  return " Initial"sv;
        case EReadState::Running:  return " Running"sv;
        case EReadState::Blocked:  return " Blocked"sv;
        case EReadState::Finished: return " Finished"sv;
    }

    // Out-of-range value (e.g. produced by a cast): return a defined result
    // instead of flowing off the end of a non-void function.
    return "Unknown"sv;
}
@@ -143,6 +145,10 @@ class TKqpStreamLookupActor : public NActors::TActorBootstrapped<TKqpStreamLooku
143
145
return (State == EReadState::Finished);
144
146
}
145
147
148
+ void SetBlocked () {
149
+ State = EReadState::Blocked;
150
+ }
151
+
146
152
const ui64 Id;
147
153
const ui64 ShardId;
148
154
EReadState State;
@@ -277,6 +283,7 @@ class TKqpStreamLookupActor : public NActors::TActorBootstrapped<TKqpStreamLooku
277
283
}
278
284
279
285
void Handle (TEvTxProxySchemeCache::TEvResolveKeySetResult::TPtr& ev) {
286
+ ResoleShardsInProgress = false ;
280
287
CA_LOG_D (" TEvResolveKeySetResult was received for table: " << StreamLookupWorker->GetTablePath ());
281
288
if (ev->Get ()->Request ->ErrorCount > 0 ) {
282
289
TString errorMsg = TStringBuilder () << " Failed to get partitioning for table: "
@@ -301,15 +308,16 @@ class TKqpStreamLookupActor : public NActors::TActorBootstrapped<TKqpStreamLooku
301
308
302
309
auto readIt = Reads.find (record.GetReadId ());
303
310
if (readIt == Reads.end () || readIt->second .State != EReadState::Running) {
304
- CA_LOG_D (" Drop read with readId: " << record.GetReadId () << " , because it's already completed" );
311
+ CA_LOG_D (" Drop read with readId: " << record.GetReadId () << " , because it's already completed or blocked " );
305
312
return ;
306
313
}
307
314
308
315
auto & read = readIt->second ;
309
316
310
317
CA_LOG_D (" Recv TEvReadResult (stream lookup) from ShardID=" << read .ShardId
311
318
<< " , Table = " << StreamLookupWorker->GetTablePath ()
312
- << " , ReadId=" << record.GetReadId ()
319
+ << " , ReadId=" << record.GetReadId () << " (current ReadId=" << ReadId << " )"
320
+ << " , SeqNo=" << record.GetSeqNo ()
313
321
<< " , Status=" << Ydb::StatusIds::StatusCode_Name (record.GetStatus ().GetCode ())
314
322
<< " , Finished=" << record.GetFinished ()
315
323
<< " , RowCount=" << record.GetRowCount ()
@@ -345,27 +353,55 @@ class TKqpStreamLookupActor : public NActors::TActorBootstrapped<TKqpStreamLooku
345
353
Counters->DataShardIteratorFails ->Inc ();
346
354
}
347
355
356
+ auto getIssues = [&record]() {
357
+ NYql::TIssues issues;
358
+ NYql::IssuesFromMessage (record.GetStatus ().GetIssues (), issues);
359
+ return issues;
360
+ };
361
+
362
+ auto replyError = [&](auto message, auto status) {
363
+ return RuntimeError (message, status, getIssues ());
364
+ };
365
+
348
366
switch (record.GetStatus ().GetCode ()) {
349
367
case Ydb::StatusIds::SUCCESS:
350
368
break ;
351
- case Ydb::StatusIds::NOT_FOUND: {
369
+ case Ydb::StatusIds::NOT_FOUND:
370
+ {
352
371
StreamLookupWorker->ResetRowsProcessing (read .Id , read .FirstUnprocessedQuery , read .LastProcessedKey );
353
372
read .SetFinished ();
373
+ CA_LOG_D (" NOT_FOUND was received from tablet: " << read .ShardId << " . "
374
+ << getIssues ().ToOneLineString ());
354
375
return ResolveTableShards ();
355
376
}
356
377
case Ydb::StatusIds::OVERLOADED: {
378
+ if (CheckTotalRetriesExeeded () || CheckShardRetriesExeeded (read )) {
379
+ return replyError (
380
+ TStringBuilder () << " Table '" << StreamLookupWorker->GetTablePath () << " ' retry limit exceeded." ,
381
+ NYql::NDqProto::StatusIds::OVERLOADED);
382
+ }
383
+ CA_LOG_D (" OVERLOADED was received from tablet: " << read .ShardId << " ."
384
+ << getIssues ().ToOneLineString ());
385
+ read .SetBlocked ();
357
386
return RetryTableRead (read , /* allowInstantRetry = */ false );
358
387
}
359
388
case Ydb::StatusIds::INTERNAL_ERROR: {
389
+ if (CheckTotalRetriesExeeded () || CheckShardRetriesExeeded (read )) {
390
+ return replyError (
391
+ TStringBuilder () << " Table '" << StreamLookupWorker->GetTablePath () << " ' retry limit exceeded." ,
392
+ NYql::NDqProto::StatusIds::INTERNAL_ERROR);
393
+ }
394
+ CA_LOG_D (" INTERNAL_ERROR was received from tablet: " << read .ShardId << " ."
395
+ << getIssues ().ToOneLineString ());
396
+ read .SetBlocked ();
360
397
return RetryTableRead (read );
361
398
}
362
399
default : {
363
- NYql::TIssues issues;
364
- NYql::IssuesFromMessage (record.GetStatus ().GetIssues (), issues);
365
- return RuntimeError (" Read request aborted" , NYql::NDqProto::StatusIds::ABORTED, issues);
400
+ return replyError (" Read request aborted" , NYql::NDqProto::StatusIds::ABORTED);
366
401
}
367
402
}
368
403
404
+ YQL_ENSURE (read .LastSeqNo < record.GetSeqNo ());
369
405
read .LastSeqNo = record.GetSeqNo ();
370
406
371
407
if (record.GetFinished ()) {
@@ -380,6 +416,8 @@ class TKqpStreamLookupActor : public NActors::TActorBootstrapped<TKqpStreamLooku
380
416
if (continuationToken.HasLastProcessedKey ()) {
381
417
TSerializedCellVec lastKey (continuationToken.GetLastProcessedKey ());
382
418
read .LastProcessedKey = TOwnedCellVec (lastKey.GetCells ());
419
+ } else {
420
+ read .LastProcessedKey .Clear ();
383
421
}
384
422
385
423
Counters->SentIteratorAcks ->Inc ();
@@ -425,6 +463,7 @@ class TKqpStreamLookupActor : public NActors::TActorBootstrapped<TKqpStreamLooku
425
463
}
426
464
}
427
465
for (auto * read : toRetry) {
466
+ read ->SetBlocked ();
428
467
RetryTableRead (*read );
429
468
}
430
469
}
@@ -436,6 +475,7 @@ class TKqpStreamLookupActor : public NActors::TActorBootstrapped<TKqpStreamLooku
436
475
if (!Partitioning) {
437
476
LookupActorStateSpan.EndError (" timeout exceeded" );
438
477
CA_LOG_D (" Retry attempt to resolve shards for table: " << StreamLookupWorker->GetTablePath ());
478
+ ResoleShardsInProgress = false ;
439
479
ResolveTableShards ();
440
480
}
441
481
}
@@ -445,7 +485,9 @@ class TKqpStreamLookupActor : public NActors::TActorBootstrapped<TKqpStreamLooku
445
485
YQL_ENSURE (readIt != Reads.end (), " Unexpected readId: " << ev->Get ()->ReadId );
446
486
auto & read = readIt->second ;
447
487
448
- if (read .State == EReadState::Running && read .LastSeqNo <= ev->Get ()->LastSeqNo ) {
488
+ YQL_ENSURE (read .State != EReadState::Blocked || read .LastSeqNo <= ev->Get ()->LastSeqNo );
489
+
490
+ if ((read .State == EReadState::Running && read .LastSeqNo <= ev->Get ()->LastSeqNo ) || read .State == EReadState::Blocked) {
449
491
if (ev->Get ()->InstantStart ) {
450
492
read .SetFinished ();
451
493
auto requests = StreamLookupWorker->RebuildRequest (read .Id , read .FirstUnprocessedQuery , read .LastProcessedKey , ReadId);
@@ -538,24 +580,33 @@ class TKqpStreamLookupActor : public NActors::TActorBootstrapped<TKqpStreamLooku
538
580
}
539
581
}
540
582
583
+ bool CheckTotalRetriesExeeded () {
584
+ const auto limit = MaxTotalRetries ();
585
+ return limit && TotalRetryAttempts + 1 > *limit;
586
+ }
587
+
588
+ bool CheckShardRetriesExeeded (TReadState& failedRead) {
589
+ const auto & shardState = ReadsPerShard[failedRead.ShardId ];
590
+ return shardState.RetryAttempts + 1 > MaxShardRetries ();
591
+ }
592
+
541
593
void RetryTableRead (TReadState& failedRead, bool allowInstantRetry = true ) {
542
594
CA_LOG_D (" Retry reading of table: " << StreamLookupWorker->GetTablePath () << " , readId: " << failedRead.Id
543
595
<< " , shardId: " << failedRead.ShardId );
544
596
545
- ++TotalRetryAttempts;
546
- auto totalRetriesLimit = MaxTotalRetries ();
547
- if (totalRetriesLimit && TotalRetryAttempts > *totalRetriesLimit) {
597
+ if (CheckTotalRetriesExeeded ()) {
548
598
return RuntimeError (TStringBuilder () << " Table '" << StreamLookupWorker->GetTablePath () << " ' retry limit exceeded" ,
549
599
NYql::NDqProto::StatusIds::UNAVAILABLE);
550
600
}
601
+ ++TotalRetryAttempts;
551
602
552
- auto & shardState = ReadsPerShard[failedRead.ShardId ];
553
- ++shardState.RetryAttempts ;
554
- if (shardState.RetryAttempts > MaxShardRetries ()) {
603
+ if (CheckShardRetriesExeeded (failedRead)) {
555
604
StreamLookupWorker->ResetRowsProcessing (failedRead.Id , failedRead.FirstUnprocessedQuery , failedRead.LastProcessedKey );
556
605
failedRead.SetFinished ();
557
606
return ResolveTableShards ();
558
607
}
608
+ auto & shardState = ReadsPerShard[failedRead.ShardId ];
609
+ ++shardState.RetryAttempts ;
559
610
560
611
auto delay = CalcDelay (shardState.RetryAttempts , allowInstantRetry);
561
612
if (delay == TDuration::Zero ()) {
@@ -573,12 +624,17 @@ class TKqpStreamLookupActor : public NActors::TActorBootstrapped<TKqpStreamLooku
573
624
}
574
625
575
626
void ResolveTableShards () {
627
+ if (ResoleShardsInProgress) {
628
+ return ;
629
+ }
630
+
576
631
if (++TotalResolveShardsAttempts > MaxShardResolves ()) {
577
632
return RuntimeError (TStringBuilder () << " Table '" << StreamLookupWorker->GetTablePath () << " ' resolve attempts limit exceeded" ,
578
633
NYql::NDqProto::StatusIds::UNAVAILABLE);
579
634
}
580
635
581
636
CA_LOG_D (" Resolve shards for table: " << StreamLookupWorker->GetTablePath ());
637
+ ResoleShardsInProgress = true ;
582
638
583
639
Partitioning.reset ();
584
640
@@ -658,6 +714,7 @@ class TKqpStreamLookupActor : public NActors::TActorBootstrapped<TKqpStreamLooku
658
714
ui64 ReadId = 0 ;
659
715
size_t TotalRetryAttempts = 0 ;
660
716
size_t TotalResolveShardsAttempts = 0 ;
717
+ bool ResoleShardsInProgress = false ;
661
718
662
719
// stats
663
720
ui64 ReadRowsCount = 0 ;
0 commit comments