Skip to content

Commit 0790396

Browse files
committed
add stats for queries with errors
1 parent 4efd471 commit 0790396

9 files changed

+206
-185
lines changed

ydb/core/kqp/executer_actor/kqp_data_executer.cpp

-34
Original file line numberDiff line numberDiff line change
@@ -204,40 +204,6 @@ class TKqpDataExecuter : public TKqpExecuterBase<TKqpDataExecuter, EExecType::Da
204204
);
205205
}
206206

207-
bool LogStatsByLongTasks() const {
208-
return Stats->CollectStatsByLongTasks && HasOlapTable;
209-
}
210-
211-
void FillResponseStats(Ydb::StatusIds::StatusCode status) {
212-
auto& response = *ResponseEv->Record.MutableResponse();
213-
214-
response.SetStatus(status);
215-
216-
if (Stats) {
217-
ReportEventElapsedTime();
218-
219-
Stats->FinishTs = TInstant::Now();
220-
Stats->Finish();
221-
222-
if (LogStatsByLongTasks() || CollectFullStats(Request.StatsMode)) {
223-
for (ui32 txId = 0; txId < Request.Transactions.size(); ++txId) {
224-
const auto& tx = Request.Transactions[txId].Body;
225-
auto planWithStats = AddExecStatsToTxPlan(tx->GetPlan(), response.GetResult().GetStats());
226-
response.MutableResult()->MutableStats()->AddTxPlansWithStats(planWithStats);
227-
}
228-
}
229-
230-
if (LogStatsByLongTasks()) {
231-
const auto& txPlansWithStats = response.GetResult().GetStats().GetTxPlansWithStats();
232-
if (!txPlansWithStats.empty()) {
233-
LOG_N("Full stats: " << txPlansWithStats);
234-
}
235-
}
236-
237-
Stats.reset();
238-
}
239-
}
240-
241207
void Finalize() {
242208
if (LocksBroken) {
243209
TString message = "Transaction locks invalidated.";

ydb/core/kqp/executer_actor/kqp_executer_impl.h

+58-88
Original file line numberDiff line numberDiff line change
@@ -408,86 +408,48 @@ class TKqpExecuterBase : public TActorBootstrapped<TDerived> {
408408
}
409409
}
410410

411+
YQL_ENSURE(Planner);
412+
bool populateChannels = Planner->AcknowledgeCA(taskId, computeActor, &state);
413+
411414
switch (state.GetState()) {
412415
case NYql::NDqProto::COMPUTE_STATE_UNKNOWN: {
413416
YQL_ENSURE(false, "unexpected state from " << computeActor << ", task: " << taskId);
414417
return;
415418
}
416419

417-
case NYql::NDqProto::COMPUTE_STATE_FAILURE: {
418-
ReplyErrorAndDie(NYql::NDq::DqStatusToYdbStatus(state.GetStatusCode()), state.MutableIssues());
419-
return;
420-
}
421-
422420
case NYql::NDqProto::COMPUTE_STATE_EXECUTING: {
423-
// initial TEvState event from Compute Actor
424-
// there can be race with RM answer
425-
if (Planner) {
426-
if (Planner->GetPendingComputeTasks().erase(taskId)) {
427-
auto it = Planner->GetPendingComputeActors().emplace(computeActor, TProgressStat());
428-
YQL_ENSURE(it.second);
429-
430-
if (state.HasStats()) {
431-
it.first->second.Set(state.GetStats());
432-
}
433-
434-
auto& task = TasksGraph.GetTask(taskId);
435-
task.ComputeActorId = computeActor;
436-
437-
THashMap<TActorId, THashSet<ui64>> updates;
438-
CollectTaskChannelsUpdates(task, updates);
439-
PropagateChannelsUpdates(updates);
440-
} else {
441-
auto it = Planner->GetPendingComputeActors().find(computeActor);
442-
if (it != Planner->GetPendingComputeActors().end()) {
443-
if (state.HasStats()) {
444-
it->second.Set(state.GetStats());
445-
}
446-
}
447-
}
421+
if (populateChannels) {
422+
auto& task = TasksGraph.GetTask(taskId);
423+
THashMap<TActorId, THashSet<ui64>> updates;
424+
CollectTaskChannelsUpdates(task, updates);
425+
PropagateChannelsUpdates(updates);
448426
}
449427
break;
450428
}
451429

430+
case NYql::NDqProto::COMPUTE_STATE_FAILURE:
452431
case NYql::NDqProto::COMPUTE_STATE_FINISHED: {
432+
ExtraData[computeActor].Swap(state.MutableExtraData());
453433
if (Stats) {
454434
Stats->AddComputeActorStats(
455435
computeActor.NodeId(),
456436
std::move(*state.MutableStats()),
457437
TDuration::MilliSeconds(AggregationSettings.GetCollectLongTasksStatsTimeoutMs())
458438
);
459439
}
460-
ExtraData[computeActor].Swap(state.MutableExtraData());
461440

462441
LastTaskId = taskId;
463442
LastComputeActorId = computeActor.ToString();
464-
465-
if (Planner) {
466-
auto it = Planner->GetPendingComputeActors().find(computeActor);
467-
if (it == Planner->GetPendingComputeActors().end()) {
468-
LOG_W("Got execution state for compute actor: " << computeActor
469-
<< ", task: " << taskId
470-
<< ", state: " << NYql::NDqProto::EComputeState_Name((NYql::NDqProto::EComputeState) state.GetState())
471-
<< ", too early (waiting reply from RM)");
472-
473-
if (Planner && Planner->GetPendingComputeTasks().erase(taskId)) {
474-
LOG_E("Got execution state for compute actor: " << computeActor
475-
<< ", for unknown task: " << state.GetTaskId()
476-
<< ", state: " << NYql::NDqProto::EComputeState_Name((NYql::NDqProto::EComputeState) state.GetState()));
477-
return;
478-
}
479-
} else {
480-
if (state.HasStats()) {
481-
it->second.Set(state.GetStats());
482-
}
483-
LastStats.emplace_back(std::move(it->second));
484-
Planner->GetPendingComputeActors().erase(it);
485-
YQL_ENSURE(Planner->GetPendingComputeTasks().find(taskId) == Planner->GetPendingComputeTasks().end());
486-
}
487-
}
443+
YQL_ENSURE(Planner);
444+
Planner->CompletedCA(taskId, computeActor);
488445
}
489446
}
490447

448+
if (state.GetState() == NYql::NDqProto::COMPUTE_STATE_FAILURE) {
449+
ReplyErrorAndDie(NYql::NDq::DqStatusToYdbStatus(state.GetStatusCode()), state.MutableIssues());
450+
return;
451+
}
452+
491453
static_cast<TDerived*>(this)->CheckExecutionComplete();
492454
}
493455

@@ -683,20 +645,14 @@ class TKqpExecuterBase : public TActorBootstrapped<TDerived> {
683645
auto taskId = startedTask.GetTaskId();
684646
auto& task = TasksGraph.GetTask(taskId);
685647

686-
task.ComputeActorId = ActorIdFromProto(startedTask.GetActorId());
687-
688-
LOG_D("Executing task: " << taskId << " on compute actor: " << task.ComputeActorId);
689-
690-
if (Planner) {
691-
if (Planner->GetPendingComputeTasks().erase(taskId) == 0) {
692-
LOG_D("Executing task: " << taskId << ", compute actor: " << task.ComputeActorId << ", already finished");
693-
} else {
694-
auto result = Planner->GetPendingComputeActors().emplace(std::make_pair(task.ComputeActorId, TProgressStat()));
695-
YQL_ENSURE(result.second);
696-
697-
CollectTaskChannelsUpdates(task, channelsUpdates);
698-
}
648+
TActorId computeActorId = ActorIdFromProto(startedTask.GetActorId());
649+
LOG_D("Executing task: " << taskId << " on compute actor: " << computeActorId);
650+
YQL_ENSURE(Planner);
651+
bool channelUpdates = Planner->AcknowledgeCA(taskId, computeActorId, nullptr);
652+
if (channelUpdates) {
653+
CollectTaskChannelsUpdates(task, channelsUpdates);
699654
}
655+
700656
}
701657

702658
PropagateChannelsUpdates(channelsUpdates);
@@ -789,16 +745,9 @@ class TKqpExecuterBase : public TActorBootstrapped<TDerived> {
789745
LastResourceUsageUpdate = now;
790746

791747
TProgressStat::TEntry consumption;
792-
if (Planner) {
793-
for (const auto& p : Planner->GetPendingComputeActors()) {
794-
const auto& t = p.second.GetLastUsage();
795-
consumption += t;
796-
}
797-
}
798748

799-
for (const auto& p : LastStats) {
800-
const auto& t = p.GetLastUsage();
801-
consumption += t;
749+
if (Planner) {
750+
consumption += Planner->CalculateConsumptionUpdate();
802751
}
803752

804753
auto ru = NRuCalc::CalcRequestUnit(consumption);
@@ -811,13 +760,7 @@ class TKqpExecuterBase : public TActorBootstrapped<TDerived> {
811760
return;
812761

813762
if (Planner) {
814-
for (auto& p : Planner->GetPendingComputeActors()) {
815-
p.second.Update();
816-
}
817-
}
818-
819-
for (auto& p : LastStats) {
820-
p.Update();
763+
Planner->ShiftConsumption();
821764
}
822765

823766
if (Request.RlPath) {
@@ -1758,7 +1701,7 @@ class TKqpExecuterBase : public TActorBootstrapped<TDerived> {
17581701
ExecuterSpan.EndError(TStringBuilder() << NYql::NDqProto::StatusIds_StatusCode_Name(status));
17591702
}
17601703

1761-
static_cast<TDerived*>(this)->FillResponseStats(Ydb::StatusIds::TIMEOUT);
1704+
FillResponseStats(Ydb::StatusIds::TIMEOUT);
17621705

17631706
// TEvAbortExecution can come from either ComputeActor or SessionActor (== Target).
17641707
if (abortSender != Target) {
@@ -1775,6 +1718,34 @@ class TKqpExecuterBase : public TActorBootstrapped<TDerived> {
17751718
this->PassAway();
17761719
}
17771720

1721+
void FillResponseStats(Ydb::StatusIds::StatusCode status) {
1722+
auto& response = *ResponseEv->Record.MutableResponse();
1723+
1724+
response.SetStatus(status);
1725+
1726+
if (Stats) {
1727+
ReportEventElapsedTime();
1728+
1729+
Stats->FinishTs = TInstant::Now();
1730+
Stats->Finish();
1731+
1732+
if (Stats->CollectStatsByLongTasks || CollectFullStats(Request.StatsMode)) {
1733+
for (ui32 txId = 0; txId < Request.Transactions.size(); ++txId) {
1734+
const auto& tx = Request.Transactions[txId].Body;
1735+
auto planWithStats = AddExecStatsToTxPlan(tx->GetPlan(), response.GetResult().GetStats());
1736+
response.MutableResult()->MutableStats()->AddTxPlansWithStats(planWithStats);
1737+
}
1738+
}
1739+
1740+
if (Stats->CollectStatsByLongTasks) {
1741+
const auto& txPlansWithStats = response.GetResult().GetStats().GetTxPlansWithStats();
1742+
if (!txPlansWithStats.empty()) {
1743+
LOG_N("Full stats: " << txPlansWithStats);
1744+
}
1745+
}
1746+
}
1747+
}
1748+
17781749
virtual void ReplyErrorAndDie(Ydb::StatusIds::StatusCode status,
17791750
google::protobuf::RepeatedPtrField<Ydb::Issue::IssueMessage>* issues)
17801751
{
@@ -1794,7 +1765,8 @@ class TKqpExecuterBase : public TActorBootstrapped<TDerived> {
17941765
AlreadyReplied = true;
17951766
auto& response = *ResponseEv->Record.MutableResponse();
17961767

1797-
response.SetStatus(status);
1768+
FillResponseStats(status);
1769+
17981770
response.MutableIssues()->Swap(issues);
17991771

18001772
LOG_T("ReplyErrorAndDie. Response: " << response.DebugString()
@@ -1972,8 +1944,6 @@ class TKqpExecuterBase : public TActorBootstrapped<TDerived> {
19721944
TActorId KqpShardsResolverId;
19731945
THashMap<TActorId, NYql::NDqProto::TComputeActorExtraData> ExtraData;
19741946

1975-
TVector<TProgressStat> LastStats;
1976-
19771947
TInstant StartResolveTime;
19781948
TInstant LastResourceUsageUpdate;
19791949

ydb/core/kqp/executer_actor/kqp_planner.cpp

+71-10
Original file line numberDiff line numberDiff line change
@@ -389,12 +389,7 @@ TString TKqpPlanner::ExecuteDataComputeTask(ui64 taskId, ui32 computeTasksSize)
389389

390390
TActorId* actorId = std::get_if<TActorId>(&startResult);
391391
Y_ABORT_UNLESS(actorId);
392-
task.ComputeActorId = *actorId;
393-
394-
LOG_D("Executing task: " << taskId << " on compute actor: " << task.ComputeActorId);
395-
396-
auto result = PendingComputeActors.emplace(task.ComputeActorId, TProgressStat());
397-
YQL_ENSURE(result.second);
392+
AcknowledgeCA(taskId, *actorId, nullptr);
398393
return TString();
399394
}
400395

@@ -479,8 +474,6 @@ std::unique_ptr<IEventHandle> TKqpPlanner::PlanExecution() {
479474
if (!result.empty()) {
480475
return MakeActorStartFailureError(ExecuterId, result);
481476
}
482-
483-
PendingComputeTasks.erase(taskId);
484477
}
485478
}
486479
}
@@ -522,11 +515,79 @@ void TKqpPlanner::Unsubscribe() {
522515
}
523516
}
524517

525-
THashMap<TActorId, TProgressStat>& TKqpPlanner::GetPendingComputeActors() {
518+
bool TKqpPlanner::AcknowledgeCA(ui64 taskId, TActorId computeActor, const NYql::NDqProto::TEvComputeActorState* state) {
519+
auto& task = TasksGraph.GetTask(taskId);
520+
if (!task.ComputeActorId) {
521+
task.ComputeActorId = computeActor;
522+
PendingComputeTasks.erase(taskId);
523+
auto [it, success] = PendingComputeActors.try_emplace(computeActor);
524+
YQL_ENSURE(success);
525+
if (state && state->HasStats()) {
526+
it->second.Set(state->GetStats());
527+
}
528+
529+
return true;
530+
}
531+
532+
YQL_ENSURE(task.ComputeActorId == computeActor);
533+
auto it = PendingComputeActors.find(computeActor);
534+
if (!task.Meta.Completed) {
535+
YQL_ENSURE(it != PendingComputeActors.end());
536+
}
537+
538+
if (it != PendingComputeActors.end() && state && state->HasStats()) {
539+
it->second.Set(state->GetStats());
540+
}
541+
542+
return false;
543+
}
544+
545+
void TKqpPlanner::CompletedCA(ui64 taskId, TActorId computeActor) {
546+
auto& task = TasksGraph.GetTask(taskId);
547+
if (task.Meta.Completed) {
548+
YQL_ENSURE(!PendingComputeActors.contains(computeActor));
549+
return;
550+
}
551+
552+
task.Meta.Completed = true;
553+
auto it = PendingComputeActors.find(computeActor);
554+
YQL_ENSURE(it != PendingComputeActors.end());
555+
LastStats.emplace_back(std::move(it->second));
556+
PendingComputeActors.erase(it);
557+
return;
558+
}
559+
560+
TProgressStat::TEntry TKqpPlanner::CalculateConsumptionUpdate() {
561+
TProgressStat::TEntry consumption;
562+
563+
for (const auto& p : PendingComputeActors) {
564+
const auto& t = p.second.GetLastUsage();
565+
consumption += t;
566+
}
567+
568+
for (const auto& p : LastStats) {
569+
const auto& t = p.GetLastUsage();
570+
consumption += t;
571+
}
572+
573+
return consumption;
574+
}
575+
576+
void TKqpPlanner::ShiftConsumption() {
577+
for (auto& p : PendingComputeActors) {
578+
p.second.Update();
579+
}
580+
581+
for (auto& p : LastStats) {
582+
p.Update();
583+
}
584+
}
585+
586+
const THashMap<TActorId, TProgressStat>& TKqpPlanner::GetPendingComputeActors() {
526587
return PendingComputeActors;
527588
}
528589

529-
THashSet<ui64>& TKqpPlanner::GetPendingComputeTasks() {
590+
const THashSet<ui64>& TKqpPlanner::GetPendingComputeTasks() {
530591
return PendingComputeTasks;
531592
}
532593

0 commit comments

Comments
 (0)