Skip to content

Commit f04a115

Browse files
authored
Move the resource allocation call that precedes compute actor launch from the node service into the compute actor factory (#6583)
1 parent 91c2833 commit f04a115

File tree

5 files changed

+62
-44
lines changed

5 files changed

+62
-44
lines changed

ydb/core/kqp/compute_actor/kqp_compute_actor_factory.cpp

+12-1
Original file line numberDiff line numberDiff line change
@@ -114,13 +114,24 @@ class TKqpCaFactory : public IKqpNodeComputeActorFactory {
114114
ReasonableSpillingTreshold.store(config.GetReasonableSpillingTreshold());
115115
}
116116

117-
TActorId CreateKqpComputeActor(TCreateArgs&& args) {
117+
TActorStartResult CreateKqpComputeActor(TCreateArgs&& args) {
118118
NYql::NDq::TComputeMemoryLimits memoryLimits;
119119
memoryLimits.ChannelBufferSize = 0;
120120
memoryLimits.MkqlLightProgramMemoryLimit = MkqlLightProgramMemoryLimit.load();
121121
memoryLimits.MkqlHeavyProgramMemoryLimit = MkqlHeavyProgramMemoryLimit.load();
122122

123123
auto estimation = ResourceManager_->EstimateTaskResources(*args.Task, args.NumberOfTasks);
124+
NRm::TKqpResourcesRequest resourcesRequest;
125+
resourcesRequest.MemoryPool = args.MemoryPool;
126+
resourcesRequest.ExecutionUnits = 1;
127+
resourcesRequest.Memory = memoryLimits.MkqlLightProgramMemoryLimit;
128+
129+
auto rmResult = ResourceManager_->AllocateResources(
130+
args.TxId, args.Task->GetId(), resourcesRequest);
131+
132+
if (!rmResult) {
133+
return NRm::TKqpRMAllocateResult{rmResult};
134+
}
124135

125136
{
126137
ui32 inputChannelsCount = 0;

ydb/core/kqp/compute_actor/kqp_compute_actor_factory.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,8 @@ struct IKqpNodeComputeActorFactory {
123123
std::shared_ptr<IKqpNodeState> State = nullptr;
124124
};
125125

126-
virtual NActors::TActorId CreateKqpComputeActor(TCreateArgs&& args) = 0;
126+
typedef std::variant<TActorId, NKikimr::NKqp::NRm::TKqpRMAllocateResult> TActorStartResult;
127+
virtual TActorStartResult CreateKqpComputeActor(TCreateArgs&& args) = 0;
127128

128129
virtual void ApplyConfig(const NKikimrConfig::TTableServiceConfig::TResourceManager& config) = 0;
129130
};

ydb/core/kqp/executer_actor/kqp_planner.cpp

+29-5
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@ std::unique_ptr<TEvKqp::TEvAbortExecution> CheckTaskSize(ui64 TxId, const TIntru
3737
return nullptr;
3838
}
3939

40+
// Builds the event handle used to report that a compute actor could not be started.
//
// Creates a TEvKqp::TEvAbortExecution carrying the OVERLOADED status and the given
// `reason`, and wraps it in an IEventHandle constructed with `executerId` as both
// of its actor-id arguments.
//
// executerId - actor id of the executer the abort event is addressed to.
// reason     - human-readable failure description (callers pass the fail reason
//              obtained from the failed compute actor start).
// Returns the owning pointer to the newly created event handle.
std::unique_ptr<IEventHandle> MakeActorStartFailureError(const TActorId& executerId, const TString& reason) {
41+
auto ev = std::make_unique<TEvKqp::TEvAbortExecution>(NYql::NDqProto::StatusIds::OVERLOADED, reason);
42+
// release() hands the raw event pointer to the handle; presumably IEventHandle takes ownership of it - TODO confirm.
return std::make_unique<IEventHandle>(executerId, executerId, ev.release());
43+
}
44+
4045
void BuildInitialTaskResources(const TKqpTasksGraph& graph, ui64 taskId, TTaskResourceEstimation& ret) {
4146
const auto& task = graph.GetTask(taskId);
4247
const auto& stageInfo = graph.GetStageInfo(task.StageId);
@@ -337,12 +342,12 @@ const IKqpGateway::TKqpSnapshot& TKqpPlanner::GetSnapshot() const {
337342

338343
// optimizeProtoForLocalExecution - if we want to execute compute actor locally and don't want to serialize & then deserialize proto message
339344
// instead we just give ptr to proto message and after that we swap/copy it
340-
void TKqpPlanner::ExecuteDataComputeTask(ui64 taskId, ui32 computeTasksSize) {
345+
TString TKqpPlanner::ExecuteDataComputeTask(ui64 taskId, ui32 computeTasksSize) {
341346
auto& task = TasksGraph.GetTask(taskId);
342347
NYql::NDqProto::TDqTask* taskDesc = ArenaSerializeTaskToProto(TasksGraph, task, true);
343348
NYql::NDq::TComputeRuntimeSettings settings;
344349

345-
task.ComputeActorId = CaFactory_->CreateKqpComputeActor({
350+
auto startResult = CaFactory_->CreateKqpComputeActor({
346351
.ExecuterId = ExecuterId,
347352
.TxId = TxId,
348353
.Task = taskDesc,
@@ -360,10 +365,19 @@ void TKqpPlanner::ExecuteDataComputeTask(ui64 taskId, ui32 computeTasksSize) {
360365
.RlPath = Nothing()
361366
});
362367

368+
if (const auto* rmResult = std::get_if<NRm::TKqpRMAllocateResult>(&startResult)) {
369+
return rmResult->GetFailReason();
370+
}
371+
372+
TActorId* actorId = std::get_if<TActorId>(&startResult);
373+
Y_ABORT_UNLESS(actorId);
374+
task.ComputeActorId = *actorId;
375+
363376
LOG_D("Executing task: " << taskId << " on compute actor: " << task.ComputeActorId);
364377

365378
auto result = PendingComputeActors.emplace(task.ComputeActorId, TProgressStat());
366379
YQL_ENSURE(result.second);
380+
return TString();
367381
}
368382

369383
ui32 TKqpPlanner::GetnScanTasks() {
@@ -401,7 +415,10 @@ std::unique_ptr<IEventHandle> TKqpPlanner::PlanExecution() {
401415
// on datashard tx.
402416
if (LocalComputeTasks) {
403417
for (ui64 taskId : ComputeTasks) {
404-
ExecuteDataComputeTask(taskId, ComputeTasks.size());
418+
auto result = ExecuteDataComputeTask(taskId, ComputeTasks.size());
419+
if (!result.empty()) {
420+
return MakeActorStartFailureError(ExecuterId, result);
421+
}
405422
}
406423
ComputeTasks.clear();
407424
}
@@ -411,7 +428,10 @@ std::unique_ptr<IEventHandle> TKqpPlanner::PlanExecution() {
411428
// to execute this task locally so we can avoid useless overhead for remote task launching.
412429
for (auto& [shardId, tasks]: TasksPerNode) {
413430
for (ui64 taskId: tasks) {
414-
ExecuteDataComputeTask(taskId, tasks.size());
431+
auto result = ExecuteDataComputeTask(taskId, tasks.size());
432+
if (!result.empty()) {
433+
return MakeActorStartFailureError(ExecuterId, result);
434+
}
415435
}
416436
}
417437

@@ -437,7 +457,11 @@ std::unique_ptr<IEventHandle> TKqpPlanner::PlanExecution() {
437457
if (tasksOnNodeIt != TasksPerNode.end()) {
438458
auto& tasks = tasksOnNodeIt->second;
439459
for (ui64 taskId: tasks) {
440-
ExecuteDataComputeTask(taskId, tasks.size());
460+
auto result = ExecuteDataComputeTask(taskId, tasks.size());
461+
if (!result.empty()) {
462+
return MakeActorStartFailureError(ExecuterId, result);
463+
}
464+
441465
PendingComputeTasks.erase(taskId);
442466
}
443467
}

ydb/core/kqp/executer_actor/kqp_planner.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ class TKqpPlanner {
8585
private:
8686

8787
const IKqpGateway::TKqpSnapshot& GetSnapshot() const;
88-
void ExecuteDataComputeTask(ui64 taskId, ui32 computeTasksSize);
88+
TString ExecuteDataComputeTask(ui64 taskId, ui32 computeTasksSize);
8989
void PrepareToProcess();
9090
TString GetEstimationsInfo() const;
9191

ydb/core/kqp/node_service/kqp_node_service.cpp

+18-36
Original file line numberDiff line numberDiff line change
@@ -164,35 +164,6 @@ class TKqpNodeService : public TActorBootstrapped<TKqpNodeService> {
164164
memoryPool = NRm::EKqpMemoryPool::Unspecified;
165165
}
166166

167-
TVector<ui64> allocatedTasks;
168-
allocatedTasks.reserve(msg.GetTasks().size());
169-
for (auto& task : *msg.MutableTasks()) {
170-
NKqpNode::TTaskContext& taskCtx = request.InFlyTasks[task.GetId()];
171-
YQL_ENSURE(taskCtx.TaskId == 0);
172-
taskCtx.TaskId = task.GetId();
173-
174-
NRm::TKqpResourcesRequest resourcesRequest;
175-
resourcesRequest.MemoryPool = memoryPool;
176-
resourcesRequest.ExecutionUnits = 1;
177-
178-
// !!!!!!!!!!!!!!!!!!!!!
179-
// we have to allocate memory instead of reserve only. currently, this memory will not be used for request processing.
180-
resourcesRequest.Memory = (1 << 19) /* 512kb limit for check that memory exists for processing with minimal requirements */;
181-
182-
auto result = ResourceManager_->AllocateResources(txId, task.GetId(), resourcesRequest);
183-
184-
if (!result) {
185-
for (ui64 taskId : allocatedTasks) {
186-
ResourceManager_->FreeResources(txId, taskId);
187-
}
188-
189-
ReplyError(txId, request.Executer, msg, result.GetStatus(), result.GetFailReason());
190-
return;
191-
}
192-
193-
allocatedTasks.push_back(task.GetId());
194-
}
195-
196167
auto reply = MakeHolder<TEvKqpNode::TEvStartKqpTasksResponse>();
197168
reply->Record.SetTxId(txId);
198169

@@ -213,13 +184,8 @@ class TKqpNodeService : public TActorBootstrapped<TKqpNodeService> {
213184
}
214185

215186
const ui32 tasksCount = msg.GetTasks().size();
216-
for (int i = 0; i < msg.GetTasks().size(); ++i) {
217-
auto& dqTask = *msg.MutableTasks(i);
218-
auto& taskCtx = request.InFlyTasks[dqTask.GetId()];
219-
taskCtx.TaskId = dqTask.GetId();
220-
YQL_ENSURE(taskCtx.TaskId != 0);
221-
222-
taskCtx.ComputeActorId = CaFactory_->CreateKqpComputeActor({
187+
for (auto& dqTask: *msg.MutableTasks()) {
188+
auto result = CaFactory_->CreateKqpComputeActor({
223189
.ExecuterId = request.Executer,
224190
.TxId = txId,
225191
.Task = &dqTask,
@@ -239,6 +205,22 @@ class TKqpNodeService : public TActorBootstrapped<TKqpNodeService> {
239205
.State = State_
240206
});
241207

208+
if (const auto* rmResult = std::get_if<NRm::TKqpRMAllocateResult>(&result)) {
209+
ReplyError(txId, request.Executer, msg, rmResult->GetStatus(), rmResult->GetFailReason());
210+
bucket.NewRequest(std::move(request));
211+
TerminateTx(txId, rmResult->GetFailReason());
212+
return;
213+
}
214+
215+
auto& taskCtx = request.InFlyTasks[dqTask.GetId()];
216+
YQL_ENSURE(taskCtx.TaskId == 0);
217+
taskCtx.TaskId = dqTask.GetId();
218+
YQL_ENSURE(taskCtx.TaskId != 0);
219+
220+
TActorId* actorId = std::get_if<TActorId>(&result);
221+
Y_ABORT_UNLESS(actorId);
222+
taskCtx.ComputeActorId = *actorId;
223+
242224
LOG_D("TxId: " << txId << ", executing task: " << taskCtx.TaskId << " on compute actor: " << taskCtx.ComputeActorId);
243225

244226
auto* startedTask = reply->Record.AddStartedTasks();

0 commit comments

Comments
 (0)