Skip to content

Commit 9d94100

Browse files
committed
don't lock on zero memory change (#6926)
1 parent 607b493 commit 9d94100

12 files changed

+330
-365
lines changed

ydb/core/kqp/compute_actor/kqp_compute_actor_factory.cpp

+21-23
Original file line numberDiff line numberDiff line change
@@ -12,75 +12,71 @@ struct TMemoryQuotaManager : public NYql::NDq::TGuaranteeQuotaManager {
1212
TMemoryQuotaManager(std::shared_ptr<NRm::IKqpResourceManager> resourceManager
1313
, NRm::EKqpMemoryPool memoryPool
1414
, std::shared_ptr<IKqpNodeState> state
15-
, ui64 txId
16-
, ui64 taskId
15+
, TIntrusivePtr<NRm::TTxState> tx
16+
, TIntrusivePtr<NRm::TTaskState> task
1717
, ui64 limit
1818
, ui64 reasonableSpillingTreshold)
1919
: NYql::NDq::TGuaranteeQuotaManager(limit, limit)
2020
, ResourceManager(std::move(resourceManager))
2121
, MemoryPool(memoryPool)
2222
, State(std::move(state))
23-
, TxId(txId)
24-
, TaskId(taskId)
23+
, Tx(std::move(tx))
24+
, Task(std::move(task))
2525
, ReasonableSpillingTreshold(reasonableSpillingTreshold)
2626
{
2727
}
2828

2929
~TMemoryQuotaManager() override {
3030
if (State) {
31-
State->OnTaskTerminate(TxId, TaskId, Success);
31+
State->OnTaskTerminate(Tx->TxId, Task->TaskId, Success);
3232
}
3333

34-
ResourceManager->FreeResources(TxId, TaskId);
34+
ResourceManager->FreeResources(Tx, Task);
3535
}
3636

3737
bool AllocateExtraQuota(ui64 extraSize) override {
38-
auto result = ResourceManager->AllocateResources(TxId, TaskId,
38+
auto result = ResourceManager->AllocateResources(Tx, Task,
3939
NRm::TKqpResourcesRequest{.MemoryPool = MemoryPool, .Memory = extraSize});
4040

4141
if (!result) {
4242
AFL_WARN(NKikimrServices::KQP_COMPUTE)
4343
("problem", "cannot_allocate_memory")
44-
("tx_id", TxId)
45-
("task_id", TaskId)
44+
("tx_id", Tx->TxId)
45+
("task_id", Task->TaskId)
4646
("memory", extraSize);
4747

4848
return false;
4949
}
5050

51-
TotalQueryAllocationsSize = result.TotalAllocatedQueryMemory;
52-
5351
return true;
5452
}
5553

5654
void FreeExtraQuota(ui64 extraSize) override {
57-
ResourceManager->FreeResources(TxId, TaskId,
58-
NRm::TKqpResourcesRequest{.MemoryPool = MemoryPool, .Memory = extraSize}
59-
);
55+
NRm::TKqpResourcesRequest request = NRm::TKqpResourcesRequest{.MemoryPool = MemoryPool, .Memory = extraSize};
56+
ResourceManager->FreeResources(Tx, Task, Task->FitRequest(request));
6057
}
6158

6259
bool IsReasonableToUseSpilling() const override {
63-
return TotalQueryAllocationsSize >= ReasonableSpillingTreshold;
60+
return Tx->GetExtraMemoryAllocatedSize() >= ReasonableSpillingTreshold;
6461
}
6562

6663
TString MemoryConsumptionDetails() const override {
67-
return ResourceManager->GetTxResourcesUsageDebugInfo(TxId);
64+
return Tx->ToString();
6865
}
6966

7067
void TerminateHandler(bool success, const NYql::TIssues& issues) {
7168
AFL_DEBUG(NKikimrServices::KQP_COMPUTE)
7269
("problem", "finish_compute_actor")
73-
("tx_id", TxId)("task_id", TaskId)("success", success)("message", issues.ToOneLineString());
70+
("tx_id", Tx->TxId)("task_id", Task->TaskId)("success", success)("message", issues.ToOneLineString());
7471
Success = success;
7572
}
7673

7774
std::shared_ptr<NRm::IKqpResourceManager> ResourceManager;
7875
NRm::EKqpMemoryPool MemoryPool;
7976
std::shared_ptr<IKqpNodeState> State;
80-
ui64 TxId;
81-
ui64 TaskId;
77+
TIntrusivePtr<NRm::TTxState> Tx;
78+
TIntrusivePtr<NRm::TTaskState> Task;
8279
bool Success = true;
83-
ui64 TotalQueryAllocationsSize = 0;
8480
ui64 ReasonableSpillingTreshold = 0;
8581
};
8682

@@ -126,8 +122,10 @@ class TKqpCaFactory : public IKqpNodeComputeActorFactory {
126122
resourcesRequest.ExecutionUnits = 1;
127123
resourcesRequest.Memory = memoryLimits.MkqlLightProgramMemoryLimit;
128124

125+
TIntrusivePtr<NRm::TTaskState> task = MakeIntrusive<NRm::TTaskState>(args.Task->GetId(), args.TxInfo->CreatedAt);
126+
129127
auto rmResult = ResourceManager_->AllocateResources(
130-
args.TxId, args.Task->GetId(), resourcesRequest);
128+
args.TxInfo, task, resourcesRequest);
131129

132130
if (!rmResult) {
133131
return NRm::TKqpRMAllocateResult{rmResult};
@@ -158,8 +156,8 @@ class TKqpCaFactory : public IKqpNodeComputeActorFactory {
158156
ResourceManager_,
159157
args.MemoryPool,
160158
std::move(args.State),
161-
args.TxId,
162-
args.Task->GetId(),
159+
std::move(args.TxInfo),
160+
std::move(task),
163161
limit,
164162
ReasonableSpillingTreshold.load());
165163

ydb/core/kqp/compute_actor/kqp_compute_actor_factory.h

+1
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ struct IKqpNodeComputeActorFactory {
107107
const NActors::TActorId& ExecuterId;
108108
const ui64 TxId;
109109
NYql::NDqProto::TDqTask* Task;
110+
TIntrusivePtr<NRm::TTxState> TxInfo;
110111
const NYql::NDq::TComputeRuntimeSettings& RuntimeSettings;
111112
NWilson::TTraceId TraceId;
112113
TIntrusivePtr<NActors::TProtoArenaHolder> Arena;

ydb/core/kqp/counters/kqp_counters.cpp

+3
Original file line numberDiff line numberDiff line change
@@ -776,7 +776,10 @@ TKqpCounters::TKqpCounters(const ::NMonitoring::TDynamicCounterPtr& counters, co
776776
RmExternalMemory = KqpGroup->GetCounter("RM/ExternalMemory", false);
777777
RmNotEnoughMemory = KqpGroup->GetCounter("RM/NotEnoughMemory", true);
778778
RmNotEnoughComputeActors = KqpGroup->GetCounter("RM/NotEnoughComputeActors", true);
779+
RmOnStartAllocs = KqpGroup->GetCounter("Rm/OnStartAllocs", true);
779780
RmExtraMemAllocs = KqpGroup->GetCounter("RM/ExtraMemAllocs", true);
781+
RmExtraMemFree = KqpGroup->GetCounter("RM/ExtraMemFree", true);
782+
RmOnCompleteFree = KqpGroup->GetCounter("RM/OnCompleteFree", true);
780783
RmInternalError = KqpGroup->GetCounter("RM/InternalError", true);
781784
RmSnapshotLatency = KqpGroup->GetHistogram(
782785
"RM/SnapshotLatency", NMonitoring::ExponentialHistogram(20, 2, 1));

ydb/core/kqp/counters/kqp_counters.h

+4-1
Original file line numberDiff line numberDiff line change
@@ -350,7 +350,7 @@ class TKqpCounters : public TKqpCountersBase, public NYql::NDq::TSpillingCounter
350350
::NMonitoring::TDynamicCounterPtr WorkloadManagerGroup;
351351

352352
::NMonitoring::TDynamicCounters::TCounterPtr FullScansExecuted;
353-
353+
354354
// Lease updates counters
355355
::NMonitoring::THistogramPtr LeaseUpdateLatency;
356356
::NMonitoring::THistogramPtr RunActorLeaseUpdateBacklog;
@@ -377,6 +377,9 @@ class TKqpCounters : public TKqpCountersBase, public NYql::NDq::TSpillingCounter
377377
::NMonitoring::TDynamicCounters::TCounterPtr RmNotEnoughMemory;
378378
::NMonitoring::TDynamicCounters::TCounterPtr RmNotEnoughComputeActors;
379379
::NMonitoring::TDynamicCounters::TCounterPtr RmExtraMemAllocs;
380+
::NMonitoring::TDynamicCounters::TCounterPtr RmOnStartAllocs;
381+
::NMonitoring::TDynamicCounters::TCounterPtr RmExtraMemFree;
382+
::NMonitoring::TDynamicCounters::TCounterPtr RmOnCompleteFree;
380383
::NMonitoring::TDynamicCounters::TCounterPtr RmInternalError;
381384
NMonitoring::THistogramPtr RmSnapshotLatency;
382385
NMonitoring::THistogramPtr NodeServiceStartEventDelivery;

ydb/core/kqp/executer_actor/kqp_planner.cpp

+5
Original file line numberDiff line numberDiff line change
@@ -346,11 +346,16 @@ TString TKqpPlanner::ExecuteDataComputeTask(ui64 taskId, ui32 computeTasksSize)
346346
auto& task = TasksGraph.GetTask(taskId);
347347
NYql::NDqProto::TDqTask* taskDesc = ArenaSerializeTaskToProto(TasksGraph, task, true);
348348
NYql::NDq::TComputeRuntimeSettings settings;
349+
if (!TxInfo) {
350+
TxInfo = MakeIntrusive<NRm::TTxState>(
351+
TxId, TInstant::Now(), ResourceManager_->GetCounters());
352+
}
349353

350354
auto startResult = CaFactory_->CreateKqpComputeActor({
351355
.ExecuterId = ExecuterId,
352356
.TxId = TxId,
353357
.Task = taskDesc,
358+
.TxInfo = TxInfo,
354359
.RuntimeSettings = settings,
355360
.TraceId = NWilson::TTraceId(ExecuterSpan.GetTraceId()),
356361
.Arena = TasksGraph.GetMeta().GetArenaIntrusivePtr(),

ydb/core/kqp/executer_actor/kqp_planner.h

+1
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ class TKqpPlanner {
133133
TString SerializedGUCSettings;
134134
std::shared_ptr<NKikimr::NKqp::NRm::IKqpResourceManager> ResourceManager_;
135135
std::shared_ptr<NKikimr::NKqp::NComputeActor::IKqpNodeComputeActorFactory> CaFactory_;
136+
TIntrusivePtr<NRm::TTxState> TxInfo;
136137

137138
public:
138139
static bool UseMockEmptyPlanner; // for tests: if true then use TKqpMockEmptyPlanner that leads to the error

ydb/core/kqp/node_service/kqp_node_service.cpp

+4
Original file line numberDiff line numberDiff line change
@@ -183,12 +183,16 @@ class TKqpNodeService : public TActorBootstrapped<TKqpNodeService> {
183183
rlPath.ConstructInPlace(msgRtSettings.GetRlPath());
184184
}
185185

186+
TIntrusivePtr<NRm::TTxState> txInfo = MakeIntrusive<NRm::TTxState>(
187+
txId, TInstant::Now(), ResourceManager_->GetCounters());
188+
186189
const ui32 tasksCount = msg.GetTasks().size();
187190
for (auto& dqTask: *msg.MutableTasks()) {
188191
auto result = CaFactory_->CreateKqpComputeActor({
189192
.ExecuterId = request.Executer,
190193
.TxId = txId,
191194
.Task = &dqTask,
195+
.TxInfo = txInfo,
192196
.RuntimeSettings = runtimeSettingsBase,
193197
.TraceId = NWilson::TTraceId(ev->TraceId),
194198
.Arena = ev->Get()->Arena,

0 commit comments

Comments
 (0)