Skip to content

Commit 5cd777d

Browse files
committed
Fix stats inside plan for full stats mode (ydb-platform#8553)
(cherry picked from commit 35c8dfe)
1 parent 26a122b commit 5cd777d

File tree

8 files changed

+78
-75
lines changed

8 files changed

+78
-75
lines changed

ydb/core/kqp/executer_actor/kqp_data_executer.cpp

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -216,9 +216,7 @@ class TKqpDataExecuter : public TKqpExecuterBase<TKqpDataExecuter, EExecType::Da
216216
YqlIssue({}, TIssuesIds::KIKIMR_LOCKS_INVALIDATED, message));
217217
}
218218

219-
auto& response = *ResponseEv->Record.MutableResponse();
220-
221-
FillResponseStats(Ydb::StatusIds::SUCCESS);
219+
ResponseEv->Record.MutableResponse()->SetStatus(Ydb::StatusIds::SUCCESS);
222220
Counters->TxProxyMon->ReportStatusOK->Inc();
223221

224222
auto addLocks = [this](const auto& data) {
@@ -255,7 +253,7 @@ class TKqpDataExecuter : public TKqpExecuterBase<TKqpDataExecuter, EExecType::Da
255253
if (LockHandle) {
256254
ResponseEv->LockHandle = std::move(LockHandle);
257255
}
258-
BuildLocks(*response.MutableResult()->MutableLocks(), Locks);
256+
BuildLocks(*ResponseEv->Record.MutableResponse()->MutableResult()->MutableLocks(), Locks);
259257
}
260258

261259
auto resultSize = ResponseEv->GetByteSize();
@@ -281,7 +279,6 @@ class TKqpDataExecuter : public TKqpExecuterBase<TKqpDataExecuter, EExecType::Da
281279

282280
ExecuterSpan.EndOk();
283281

284-
Request.Transactions.crop(0);
285282
AlreadyReplied = true;
286283
PassAway();
287284
}

ydb/core/kqp/executer_actor/kqp_executer_impl.h

Lines changed: 60 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -177,10 +177,10 @@ class TKqpExecuterBase : public TActorBootstrapped<TDerived> {
177177
}
178178

179179
void ReportEventElapsedTime() {
180-
if (Stats) {
181-
ui64 elapsedMicros = TlsActivationContext->GetCurrentEventTicksAsSeconds() * 1'000'000;
182-
Stats->ExecuterCpuTime += TDuration::MicroSeconds(elapsedMicros);
183-
}
180+
YQL_ENSURE(Stats);
181+
182+
ui64 elapsedMicros = TlsActivationContext->GetCurrentEventTicksAsSeconds() * 1'000'000;
183+
Stats->ExecuterCpuTime += TDuration::MicroSeconds(elapsedMicros);
184184
}
185185

186186
protected:
@@ -330,11 +330,10 @@ class TKqpExecuterBase : public TActorBootstrapped<TDerived> {
330330
}
331331

332332
YQL_ENSURE(channel.DstTask == 0);
333+
YQL_ENSURE(Stats);
333334

334-
if (Stats) {
335-
Stats->ResultBytes += batch.Size();
336-
Stats->ResultRows += batch.RowCount();
337-
}
335+
Stats->ResultBytes += batch.Size();
336+
Stats->ResultRows += batch.RowCount();
338337

339338
LOG_T("Got result, channelId: " << channel.Id << ", shardId: " << task.Meta.ShardId
340339
<< ", inputIndex: " << channel.DstInputIndex << ", from: " << ev->Sender
@@ -391,7 +390,9 @@ class TKqpExecuterBase : public TActorBootstrapped<TDerived> {
391390
<< ", state: " << NYql::NDqProto::EComputeState_Name((NYql::NDqProto::EComputeState) state.GetState())
392391
<< ", stats: " << state.GetStats());
393392

394-
if (Stats && state.HasStats() && Request.ProgressStatsPeriod) {
393+
YQL_ENSURE(Stats);
394+
395+
if (state.HasStats() && Request.ProgressStatsPeriod) {
395396
Stats->UpdateTaskStats(taskId, state.GetStats());
396397
auto now = TInstant::Now();
397398
if (LastProgressStats + Request.ProgressStatsPeriod <= now) {
@@ -418,13 +419,11 @@ class TKqpExecuterBase : public TActorBootstrapped<TDerived> {
418419
if (Planner->CompletedCA(taskId, computeActor)) {
419420
ExtraData[computeActor].Swap(state.MutableExtraData());
420421

421-
if (Stats) {
422-
Stats->AddComputeActorStats(
423-
computeActor.NodeId(),
424-
std::move(*state.MutableStats()),
425-
TDuration::MilliSeconds(AggregationSettings.GetCollectLongTasksStatsTimeoutMs())
426-
);
427-
}
422+
Stats->AddComputeActorStats(
423+
computeActor.NodeId(),
424+
std::move(*state.MutableStats()),
425+
TDuration::MilliSeconds(AggregationSettings.GetCollectLongTasksStatsTimeoutMs())
426+
);
428427

429428
LastTaskId = taskId;
430429
LastComputeActorId = computeActor.ToString();
@@ -512,9 +511,9 @@ class TKqpExecuterBase : public TActorBootstrapped<TDerived> {
512511
auto now = TAppData::TimeProvider->Now();
513512
StartResolveTime = now;
514513

515-
if (Stats) {
516-
Stats->StartTs = now;
517-
}
514+
YQL_ENSURE(Stats);
515+
516+
Stats->StartTs = now;
518517
}
519518

520519
TMaybe<size_t> FindReadRangesSource(const NKqpProto::TKqpPhyStage& stage) {
@@ -1148,8 +1147,9 @@ class TKqpExecuterBase : public TActorBootstrapped<TDerived> {
11481147
: Nothing();
11491148

11501149
YQL_ENSURE(!shardsResolved || nodeId);
1150+
YQL_ENSURE(Stats);
11511151

1152-
if (shardId && Stats) {
1152+
if (shardId) {
11531153
Stats->AffectedShards.insert(*shardId);
11541154
}
11551155

@@ -1217,11 +1217,13 @@ class TKqpExecuterBase : public TActorBootstrapped<TDerived> {
12171217

12181218
if (partitions.size() > 0 && source.GetSequentialInFlightShards() > 0 && partitions.size() > source.GetSequentialInFlightShards()) {
12191219
auto [startShard, shardInfo] = MakeVirtualTablePartition(source, stageInfo, HolderFactory(), TypeEnv());
1220-
if (Stats) {
1221-
for (auto& [shardId, _] : partitions) {
1222-
Stats->AffectedShards.insert(shardId);
1223-
}
1220+
1221+
YQL_ENSURE(Stats);
1222+
1223+
for (auto& [shardId, _] : partitions) {
1224+
Stats->AffectedShards.insert(shardId);
12241225
}
1226+
12251227
if (shardInfo.KeyReadRanges) {
12261228
addPartiton(startShard, {}, shardInfo, source.GetSequentialInFlightShards());
12271229
fillRangesForTasks();
@@ -1484,6 +1486,8 @@ class TKqpExecuterBase : public TActorBootstrapped<TDerived> {
14841486
THashMap<ui64, ui64> assignedShardsCount;
14851487
auto& stage = stageInfo.Meta.GetStage(stageInfo.Id);
14861488

1489+
YQL_ENSURE(Stats);
1490+
14871491
const auto& tableInfo = stageInfo.Meta.TableConstInfo;
14881492
const auto& keyTypes = tableInfo->KeyColumnTypes;
14891493
ui32 metaId = 0;
@@ -1512,7 +1516,7 @@ class TKqpExecuterBase : public TActorBootstrapped<TDerived> {
15121516
nodeShards[nodeId].emplace_back(TShardInfoWithId(i.first, std::move(i.second)));
15131517
}
15141518

1515-
if (Stats && CollectProfileStats(Request.StatsMode)) {
1519+
if (CollectProfileStats(Request.StatsMode)) {
15161520
for (auto&& i : nodeShards) {
15171521
Stats->AddNodeShardsCount(stageInfo.Id.StageId, i.first, i.second.size());
15181522
}
@@ -1697,7 +1701,7 @@ class TKqpExecuterBase : public TActorBootstrapped<TDerived> {
16971701
ExecuterSpan.EndError(TStringBuilder() << NYql::NDqProto::StatusIds_StatusCode_Name(status));
16981702
}
16991703

1700-
FillResponseStats(Ydb::StatusIds::TIMEOUT);
1704+
ResponseEv->Record.MutableResponse()->SetStatus(Ydb::StatusIds::TIMEOUT);
17011705

17021706
// TEvAbortExecution can come from either ComputeActor or SessionActor (== Target).
17031707
if (abortSender != Target) {
@@ -1707,38 +1711,9 @@ class TKqpExecuterBase : public TActorBootstrapped<TDerived> {
17071711

17081712
LOG_E("Sending timeout response to: " << Target);
17091713

1710-
Request.Transactions.crop(0);
17111714
this->Shutdown();
17121715
}
17131716

1714-
void FillResponseStats(Ydb::StatusIds::StatusCode status) {
1715-
auto& response = *ResponseEv->Record.MutableResponse();
1716-
1717-
response.SetStatus(status);
1718-
1719-
if (Stats) {
1720-
ReportEventElapsedTime();
1721-
1722-
Stats->FinishTs = TInstant::Now();
1723-
Stats->Finish();
1724-
1725-
if (Stats->CollectStatsByLongTasks || CollectFullStats(Request.StatsMode)) {
1726-
for (ui32 txId = 0; txId < Request.Transactions.size(); ++txId) {
1727-
const auto& tx = Request.Transactions[txId].Body;
1728-
auto planWithStats = AddExecStatsToTxPlan(tx->GetPlan(), response.GetResult().GetStats());
1729-
response.MutableResult()->MutableStats()->AddTxPlansWithStats(planWithStats);
1730-
}
1731-
}
1732-
1733-
if (Stats->CollectStatsByLongTasks) {
1734-
const auto& txPlansWithStats = response.GetResult().GetStats().GetTxPlansWithStats();
1735-
if (!txPlansWithStats.empty()) {
1736-
LOG_N("Full stats: " << txPlansWithStats);
1737-
}
1738-
}
1739-
}
1740-
}
1741-
17421717
virtual void ReplyErrorAndDie(Ydb::StatusIds::StatusCode status,
17431718
google::protobuf::RepeatedPtrField<Ydb::Issue::IssueMessage>* issues)
17441719
{
@@ -1752,8 +1727,7 @@ class TKqpExecuterBase : public TActorBootstrapped<TDerived> {
17521727
AlreadyReplied = true;
17531728
auto& response = *ResponseEv->Record.MutableResponse();
17541729

1755-
FillResponseStats(status);
1756-
1730+
response.SetStatus(status);
17571731
response.MutableIssues()->Swap(issues);
17581732

17591733
LOG_T("ReplyErrorAndDie. Response: " << response.DebugString()
@@ -1772,7 +1746,6 @@ class TKqpExecuterBase : public TActorBootstrapped<TDerived> {
17721746
ExecuterSpan.EndError(response.DebugString());
17731747
ExecuterStateSpan.EndError(response.DebugString());
17741748

1775-
Request.Transactions.crop(0);
17761749
this->Shutdown();
17771750
}
17781751

@@ -1850,8 +1823,35 @@ class TKqpExecuterBase : public TActorBootstrapped<TDerived> {
18501823
void PassAway() override {
18511824
YQL_ENSURE(AlreadyReplied && ResponseEv);
18521825

1853-
// Actualize stats with the last stats from terminated CAs, but keep the status.
1854-
FillResponseStats(ResponseEv->Record.GetResponse().GetStatus());
1826+
// Fill response stats
1827+
{
1828+
auto& response = *ResponseEv->Record.MutableResponse();
1829+
1830+
YQL_ENSURE(Stats);
1831+
1832+
ReportEventElapsedTime();
1833+
1834+
Stats->FinishTs = TInstant::Now();
1835+
Stats->Finish();
1836+
1837+
if (Stats->CollectStatsByLongTasks || CollectFullStats(Request.StatsMode)) {
1838+
response.MutableResult()->MutableStats()->ClearTxPlansWithStats();
1839+
for (ui32 txId = 0; txId < Request.Transactions.size(); ++txId) {
1840+
const auto& tx = Request.Transactions[txId].Body;
1841+
auto planWithStats = AddExecStatsToTxPlan(tx->GetPlan(), response.GetResult().GetStats());
1842+
response.MutableResult()->MutableStats()->AddTxPlansWithStats(planWithStats);
1843+
}
1844+
}
1845+
1846+
if (Stats->CollectStatsByLongTasks) {
1847+
const auto& txPlansWithStats = response.GetResult().GetStats().GetTxPlansWithStats();
1848+
if (!txPlansWithStats.empty()) {
1849+
LOG_N("Full stats: " << response.GetResult().GetStats());
1850+
}
1851+
}
1852+
}
1853+
1854+
Request.Transactions.crop(0);
18551855
this->Send(Target, ResponseEv.release());
18561856

18571857
for (auto channelPair: ResultChannelProxies) {

ydb/core/kqp/executer_actor/kqp_executer_stats.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,10 @@ void TQueryExecutionStats::AddComputeActorStats(ui32 /* nodeId */, NYql::NDqProt
449449
}
450450

451451
// checking whether the task is long
452+
453+
// TODO(ilezhankin): investigate - for some reason `task.FinishTimeMs` may be large (or small?)
454+
// enough to result in an enormous duration - triggering the "long tasks" mode.
455+
452456
auto taskDuration = TDuration::MilliSeconds(task.GetFinishTimeMs() - task.GetStartTimeMs());
453457
bool longTask = taskDuration > collectLongTaskStatsTimeout;
454458
if (longTask) {
@@ -784,7 +788,7 @@ void TQueryExecutionStats::Finish() {
784788
Result->SetCpuTimeUs(Result->GetCpuTimeUs() + ExecuterCpuTime.MicroSeconds());
785789
Result->SetDurationUs(FinishTs.MicroSeconds() - StartTs.MicroSeconds());
786790

787-
// Result->Result* feilds are (temporary?) commented out in proto due to lack of use
791+
// Result->Result* fields are (temporary?) commented out in proto due to lack of use
788792
//
789793
// Result->SetResultBytes(ResultBytes);
790794
// Result->SetResultRows(ResultRows);

ydb/core/kqp/executer_actor/kqp_scan_executer.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ class TKqpScanExecuter : public TKqpExecuterBase<TKqpScanExecuter, EExecType::Sc
278278
YQL_ENSURE(!AlreadyReplied);
279279
AlreadyReplied = true;
280280

281-
FillResponseStats(Ydb::StatusIds::SUCCESS);
281+
ResponseEv->Record.MutableResponse()->SetStatus(Ydb::StatusIds::SUCCESS);
282282

283283
LWTRACK(KqpScanExecuterFinalize, ResponseEv->Orbit, TxId, LastTaskId, LastComputeActorId, ResponseEv->ResultsSize());
284284

ydb/core/kqp/opt/kqp_query_plan.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2101,9 +2101,9 @@ NJson::TJsonValue ReconstructQueryPlanRec(const NJson::TJsonValue& plan,
21012101

21022102
newPlans.AppendValue(ReconstructQueryPlanRec(
21032103
plan.GetMapSafe().at("Plans").GetArraySafe()[0],
2104-
0,
2105-
planIndex,
2106-
precomputes,
2104+
0,
2105+
planIndex,
2106+
precomputes,
21072107
nodeCounter));
21082108

21092109
newPlans.AppendValue(lookupPlan);

ydb/core/kqp/rm_service/kqp_rm_service.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,10 +114,10 @@ class TTxState : public TAtomicRefCount<TTxState> {
114114
{}
115115

116116
TString ToString() const {
117-
return TStringBuilder() << "TxResourcesInfo{ "
117+
return TStringBuilder() << "TxResourcesInfo { "
118118
<< "TxId: " << TxId
119119
<< ", memory initially granted resources: " << TxExternalDataQueryMemory.load()
120-
<< ", extra allocations " << TxScanQueryMemory.load()
120+
<< ", tx total allocations " << TxScanQueryMemory.load()
121121
<< ", execution units: " << TxExecutionUnits.load()
122122
<< ", started at: " << CreatedAt
123123
<< " }";

ydb/library/yql/dq/actors/compute/dq_compute_actor_impl.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -313,12 +313,12 @@ class TDqComputeActorBase : public NActors::TActorBootstrapped<TDerived>
313313
void OnMemoryLimitExceptionHandler() {
314314
TString memoryConsumptionDetails = MemoryLimits.MemoryQuotaManager->MemoryConsumptionDetails();
315315
TStringBuilder failureReason = TStringBuilder()
316-
<< "Mkql memory limit exceeded, limit: " << GetMkqlMemoryLimit()
316+
<< "Mkql memory limit exceeded, allocated by task " << Task.GetId() << ": " << GetMkqlMemoryLimit()
317317
<< ", host: " << HostName()
318318
<< ", canAllocateExtraMemory: " << CanAllocateExtraMemory;
319319

320320
if (!memoryConsumptionDetails.empty()) {
321-
failureReason << ", memory manager details: " << memoryConsumptionDetails;
321+
failureReason << ", memory manager details for current node: " << memoryConsumptionDetails;
322322
}
323323

324324
InternalError(NYql::NDqProto::StatusIds::OVERLOADED, TIssuesIds::KIKIMR_PRECONDITION_FAILED, failureReason);
@@ -1809,6 +1809,8 @@ class TDqComputeActorBase : public NActors::TActorBootstrapped<TDerived>
18091809
}
18101810
}
18111811
}
1812+
} else {
1813+
// TODO: what should happen in this case?
18121814
}
18131815

18141816
static_cast<TDerived*>(this)->FillExtraStats(dst, last);

ydb/library/yql/dq/actors/protos/dq_stats.proto

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,7 @@ message TDqExecutionStats {
357357
uint64 CpuTimeUs = 1; // total cpu time, executer + compute actors + ...
358358
uint64 DurationUs = 2; // execution wall time
359359

360-
// these feilds are never used, needs to be reviewed
360+
// these fields are never used, needs to be reviewed
361361
reserved 3; // uint64 ResultRows = 3;
362362
reserved 4; // uint64 ResultBytes = 4;
363363

0 commit comments

Comments
 (0)