@@ -60,15 +60,8 @@ NKqpNode::TState& GetStateBucketByTx(std::shared_ptr<TBucketArray> buckets, ui64
60
60
}
61
61
62
62
void FinishKqpTask (ui64 txId, ui64 taskId, bool success, NKqpNode::TState& bucket, std::shared_ptr<NRm::IKqpResourceManager> ResourceManager) {
63
- auto ctx = bucket.RemoveTask (txId, taskId, success);
64
- if (ctx) {
65
- ResourceManager->FreeExecutionUnits (1 );
66
- if (ctx->ComputeActorsNumber == 0 ) {
67
- ResourceManager->FreeResources (txId);
68
- } else {
69
- ResourceManager->FreeResources (txId, taskId);
70
- }
71
- }
63
+ bucket.RemoveTask (txId, taskId, success);
64
+ ResourceManager->FreeResources (txId, taskId);
72
65
}
73
66
74
67
struct TMemoryQuotaManager : public NYql ::NDq::TGuaranteeQuotaManager {
@@ -86,7 +79,8 @@ struct TMemoryQuotaManager : public NYql::NDq::TGuaranteeQuotaManager {
86
79
, Buckets(std::move(buckets))
87
80
, TxId(txId)
88
81
, TaskId(taskId)
89
- , InstantAlloc(instantAlloc) {
82
+ , InstantAlloc(instantAlloc)
83
+ {
90
84
}
91
85
92
86
~TMemoryQuotaManager () override {
@@ -100,8 +94,10 @@ struct TMemoryQuotaManager : public NYql::NDq::TGuaranteeQuotaManager {
100
94
return false ;
101
95
}
102
96
103
- if (!ResourceManager->AllocateResources (TxId, TaskId,
104
- NRm::TKqpResourcesRequest{.MemoryPool = MemoryPool, .Memory = extraSize})) {
97
+ auto result = ResourceManager->AllocateResources (TxId, TaskId,
98
+ NRm::TKqpResourcesRequest{.MemoryPool = MemoryPool, .Memory = extraSize});
99
+
100
+ if (!result) {
105
101
LOG_W (" Can not allocate memory. TxId: " << TxId << " , taskId: " << TaskId << " , memory: +" << extraSize);
106
102
return false ;
107
103
}
@@ -292,8 +288,14 @@ class TKqpNodeService : public TActorBootstrapped<TKqpNodeService> {
292
288
LOG_D (" TxId: " << txId << " , new compute tasks request from " << requester
293
289
<< " with " << msg.GetTasks ().size () << " tasks: " << TasksIdsStr (msg.GetTasks ()));
294
290
295
- NKqpNode::TTasksRequest request;
296
- request.Executer = ActorIdFromProto (msg.GetExecuterActorId ());
291
+ auto now = TAppData::TimeProvider->Now ();
292
+ NKqpNode::TTasksRequest request (txId, ev->Sender , now);
293
+ auto & msgRtSettings = msg.GetRuntimeSettings ();
294
+ if (msgRtSettings.GetTimeoutMs () > 0 ) {
295
+ // compute actor should not arm timer since in case of timeout it will receive TEvAbortExecution from Executer
296
+ auto timeout = TDuration::MilliSeconds (msgRtSettings.GetTimeoutMs ());
297
+ request.Deadline = now + timeout + /* gap */ TDuration::Seconds (5 );
298
+ }
297
299
298
300
auto & bucket = GetStateBucketByTx (Buckets, txId);
299
301
@@ -311,16 +313,6 @@ class TKqpNodeService : public TActorBootstrapped<TKqpNodeService> {
311
313
memoryPool = NRm::EKqpMemoryPool::Unspecified;
312
314
}
313
315
314
- size_t executionUnits = msg.GetTasks ().size ();
315
- if (!ResourceManager ()->AllocateExecutionUnits (executionUnits)) {
316
- Counters->RmNotEnoughComputeActors ->Inc ();
317
- TStringBuilder error;
318
- error << " TxId: " << txId << " , NodeId: " << SelfId ().NodeId () << " , not enough compute actors, requested " << msg.GetTasks ().size ();
319
- LOG_N (error);
320
- ReplyError (txId, request.Executer , msg, NKikimrKqp::TEvStartKqpTasksResponse::NOT_ENOUGH_EXECUTION_UNITS, error);
321
- return ;
322
- }
323
-
324
316
ui32 requestChannels = 0 ;
325
317
for (auto & dqTask : *msg.MutableTasks ()) {
326
318
auto estimation = EstimateTaskResources (dqTask, Config, msg.GetTasks ().size ());
@@ -348,20 +340,20 @@ class TKqpNodeService : public TActorBootstrapped<TKqpNodeService> {
348
340
for (auto & task : request.InFlyTasks ) {
349
341
NRm::TKqpResourcesRequest resourcesRequest;
350
342
resourcesRequest.MemoryPool = memoryPool;
343
+ resourcesRequest.ExecutionUnits = 1 ;
351
344
352
345
// !!!!!!!!!!!!!!!!!!!!!
353
346
// we have to allocate memory instead of reserve only. currently, this memory will not be used for request processing.
354
347
resourcesRequest.Memory = Min<double >(task.second .Memory , 1 << 19 ) /* 512kb limit for check that memory exists for processing with minimal requirements */ ;
355
348
356
- NRm::TKqpNotEnoughResources resourcesResponse;
357
- if (!ResourceManager ()->AllocateResources (txId, task.first , resourcesRequest, &resourcesResponse)) {
349
+ auto result = ResourceManager ()->AllocateResources (txId, task.first , resourcesRequest);
350
+
351
+ if (!result) {
358
352
for (ui64 taskId : allocatedTasks) {
359
353
ResourceManager ()->FreeResources (txId, taskId);
360
354
}
361
355
362
- ResourceManager ()->FreeExecutionUnits (executionUnits);
363
-
364
- ReplyError (txId, request.Executer , msg, resourcesResponse.GetStatus (), resourcesResponse.GetFailReason ());
356
+ ReplyError (txId, request.Executer , msg, result.GetStatus (), result.GetFailReason ());
365
357
return ;
366
358
}
367
359
@@ -377,14 +369,6 @@ class TKqpNodeService : public TActorBootstrapped<TKqpNodeService> {
377
369
memoryLimits.MkqlHeavyProgramMemoryLimit = Config.GetMkqlHeavyProgramMemoryLimit ();
378
370
379
371
NYql::NDq::TComputeRuntimeSettings runtimeSettingsBase;
380
- auto & msgRtSettings = msg.GetRuntimeSettings ();
381
- if (msgRtSettings.GetTimeoutMs () > 0 ) {
382
- // compute actor should not arm timer since in case of timeout it will receive TEvAbortExecution from Executer
383
- auto timeout = TDuration::MilliSeconds (msgRtSettings.GetTimeoutMs ());
384
- request.Deadline = TAppData::TimeProvider->Now () + timeout + /* gap */ TDuration::Seconds (5 );
385
- bucket.InsertExpiringRequest (request.Deadline , txId, requester);
386
- }
387
-
388
372
runtimeSettingsBase.ExtraMemoryAllocationPool = memoryPool;
389
373
runtimeSettingsBase.FailOnUndelivery = msgRtSettings.GetExecType () != NYql::NDqProto::TComputeRuntimeSettings::SCAN;
390
374
@@ -502,7 +486,7 @@ class TKqpNodeService : public TActorBootstrapped<TKqpNodeService> {
502
486
503
487
Counters->NodeServiceProcessTime ->Collect (NHPTimer::GetTimePassed (&workHandlerStart) * SecToUsec);
504
488
505
- bucket.NewRequest (txId, requester, std::move (request), memoryPool );
489
+ bucket.NewRequest (std::move (request));
506
490
}
507
491
508
492
// used only for unit tests
@@ -522,36 +506,30 @@ class TKqpNodeService : public TActorBootstrapped<TKqpNodeService> {
522
506
Counters->NodeServiceProcessCancelTime ->Collect (timer.Passed () * SecToUsec);
523
507
}
524
508
525
- void TerminateTx (ui64 txId, const TString& reason) {
509
+ void TerminateTx (ui64 txId, const TString& reason, NYql::NDqProto::StatusIds_StatusCode status = NYql::NDqProto::StatusIds::UNSPECIFIED ) {
526
510
auto & bucket = GetStateBucketByTx (Buckets, txId);
527
511
auto tasksToAbort = bucket.GetTasksByTxId (txId);
528
512
529
513
if (!tasksToAbort.empty ()) {
514
+ TStringBuilder finalReason;
515
+ finalReason << " node service cancelled the task, because it " << reason
516
+ << " , NodeId: " << SelfId ().NodeId ()
517
+ << " , TxId: " << txId;
518
+
519
+ LOG_E (finalReason);
530
520
for (const auto & [taskId, computeActorId]: tasksToAbort) {
531
- auto abortEv = MakeHolder<TEvKqp::TEvAbortExecution>(NYql::NDqProto::StatusIds::UNSPECIFIED,
532
- reason);
533
- Send (computeActorId, abortEv.Release ());
521
+ auto abortEv = std::make_unique<TEvKqp::TEvAbortExecution>(status, reason);
522
+ Send (computeActorId, abortEv.release ());
534
523
}
535
524
}
536
525
}
537
526
538
527
void HandleWork (TEvents::TEvWakeup::TPtr& ev) {
539
528
Schedule (TDuration::Seconds (1 ), ev->Release ().Release ());
540
- std::vector<ui64> txIdsToFree;
541
529
for (auto & bucket : *Buckets) {
542
530
auto expiredRequests = bucket.ClearExpiredRequests ();
543
531
for (auto & cxt : expiredRequests) {
544
- LOG_D (" txId: " << cxt.RequestId .TxId << " , requester: " << cxt.RequestId .Requester
545
- << " , execution timeout, request: " << cxt.Exists );
546
- if (!cxt.Exists ) {
547
- // it is ok since in most cases requests is finished by exlicit TEvAbortExecution from their Executer
548
- LOG_I (" txId: " << cxt.RequestId .TxId << " , requester: " << cxt.RequestId .Requester
549
- << " , unknown request" );
550
- continue ;
551
- }
552
- // don't send to executer and compute actors, they will be destroyed by TEvAbortExecution in that order:
553
- // KqpProxy -> SessionActor -> Executer -> ComputeActor
554
- ResourceManager ()->FreeResources (cxt.RequestId .TxId );
532
+ TerminateTx (cxt.TxId , " reached execution deadline" , NYql::NDqProto::StatusIds::TIMEOUT);
555
533
}
556
534
}
557
535
}
@@ -619,9 +597,9 @@ class TKqpNodeService : public TActorBootstrapped<TKqpNodeService> {
619
597
switch (ev->Get ()->SourceType ) {
620
598
case TEvKqpNode::TEvStartKqpTasksResponse::EventType: {
621
599
ui64 txId = ev->Cookie ;
622
- LOG_E ( " TxId: " << txId << " , executer lost: " << ( int ) ev-> Get ()-> Reason ) ;
623
-
624
- TerminateTx (txId, " executer lost " );
600
+ TStringBuilder reason ;
601
+ reason << " executer lost: " << ( int ) ev-> Get ()-> Reason ;
602
+ TerminateTx (txId, reason, NYql::NDqProto::StatusIds::ABORTED );
625
603
break ;
626
604
}
627
605
0 commit comments