Skip to content

Commit 1f5a4f3

Browse files
authored
extra details added to error message (#9539)
1 parent 5d28dd1 commit 1f5a4f3

File tree

2 files changed

+19
-4
lines changed

2 files changed

+19
-4
lines changed

ydb/core/kqp/rm_service/kqp_rm_service.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,7 @@ class TKqpResourceManager : public IKqpResourceManager {
289289

290290
if (!hasScanQueryMemory) {
291291
Counters->RmNotEnoughMemory->Inc();
292+
tx->AckFailedMemoryAlloc(resources.Memory);
292293
TStringBuilder reason;
293294
reason << "TxId: " << txId << ", taskId: " << taskId << ". Not enough memory for query, requested: " << resources.Memory
294295
<< ". " << tx->ToString();
@@ -302,6 +303,7 @@ class TKqpResourceManager : public IKqpResourceManager {
302303
Y_DEFER {
303304
if (!result) {
304305
Counters->RmNotEnoughMemory->Inc();
306+
tx->AckFailedMemoryAlloc(resources.Memory);
305307
with_lock (Lock) {
306308
TotalMemoryResource->Release(resources.Memory);
307309
if (!tx->PoolId.empty()) {

ydb/core/kqp/rm_service/kqp_rm_service.h

+17-4
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
#include <util/datetime/base.h>
1212
#include <util/string/builder.h>
13+
#include <util/stream/format.h>
1314

1415
#include "kqp_resource_estimation.h"
1516

@@ -122,6 +123,8 @@ class TTxState : public TAtomicRefCount<TTxState> {
122123
std::atomic<ui64> TxScanQueryMemory = 0;
123124
std::atomic<ui64> TxExternalDataQueryMemory = 0;
124125
std::atomic<ui32> TxExecutionUnits = 0;
126+
std::atomic<ui64> TxMaxAllocation = 0;
127+
std::atomic<ui64> TxFailedAllocation = 0;
125128

126129
public:
127130
explicit TTxState(ui64 txId, TInstant now, TIntrusivePtr<TKqpCounters> counters, const TString& poolId, const double memoryPoolPercent,
@@ -145,12 +148,14 @@ class TTxState : public TAtomicRefCount<TTxState> {
145148

146149
if (!PoolId.empty()) {
147150
res << ", PoolId: " << PoolId
148-
<< ", MemoryPoolPercent: " << Sprintf("%.2f", MemoryPoolPercent);
151+
<< ", MemoryPoolPercent: " << Sprintf("%.2f", MemoryPoolPercent > 0 ? MemoryPoolPercent : 100);
149152
}
150153

151-
res << ", memory initially granted resources: " << TxExternalDataQueryMemory.load()
152-
<< ", tx total allocations " << TxScanQueryMemory.load()
153-
<< ", execution units: " << TxExecutionUnits.load()
154+
res << ", tx initially granted memory: " << HumanReadableSize(TxExternalDataQueryMemory.load(), SF_BYTES)
155+
<< ", tx total memory allocations: " << HumanReadableSize(TxScanQueryMemory.load(), SF_BYTES)
156+
<< ", tx largest successful memory allocation: " << HumanReadableSize(TxMaxAllocation.load(), SF_BYTES)
157+
<< ", tx largest failed memory allocation: " << HumanReadableSize(TxFailedAllocation.load(), SF_BYTES)
158+
<< ", tx total execution units: " << TxExecutionUnits.load()
154159
<< ", started at: " << CreatedAt
155160
<< " }";
156161

@@ -161,6 +166,11 @@ class TTxState : public TAtomicRefCount<TTxState> {
161166
return TxScanQueryMemory.load();
162167
}
163168

169+
// Remembers `memory` in TxFailedAllocation when it is larger than the value
// currently stored there; smaller or equal values leave the field untouched.
void AckFailedMemoryAlloc(ui64 memory) {
    ui64 observed = TxFailedAllocation.load();
    // Retry until the stored value is already >= memory or our store succeeds.
    while (observed < memory) {
        // On failure compare_exchange_weak reloads `observed` with the current
        // stored value, so the loop condition is re-checked against fresh data.
        if (TxFailedAllocation.compare_exchange_weak(observed, memory)) {
            break;
        }
    }
}
173+
164174
void Released(TIntrusivePtr<TTaskState>& taskState, const TKqpResourcesRequest& resources) {
165175
if (resources.ExecutionUnits) {
166176
Counters->RmOnCompleteFree->Inc();
@@ -176,6 +186,9 @@ class TTxState : public TAtomicRefCount<TTxState> {
176186
taskState->ScanQueryMemory -= resources.Memory;
177187
Counters->RmMemory->Sub(resources.Memory);
178188

189+
ui64 maxAlloc = TxMaxAllocation.load();
190+
while(maxAlloc < resources.Memory && !TxMaxAllocation.compare_exchange_weak(maxAlloc, resources.Memory));
191+
179192
TxExecutionUnits.fetch_sub(resources.ExecutionUnits);
180193
taskState->ExecutionUnits -= resources.ExecutionUnits;
181194
Counters->RmComputeActors->Sub(resources.ExecutionUnits);

0 commit comments

Comments
 (0)