From 1d225a1c6c16157282e1937cfb054ee4971c3460 Mon Sep 17 00:00:00 2001 From: Andrei Rykov Date: Wed, 17 Jan 2024 06:27:05 +0000 Subject: [PATCH 1/2] added RestartsPerPeriod info --- ydb/core/health_check/health_check.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index e4bce7b539d8..1f64437bbd2c 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -230,6 +230,7 @@ class TSelfCheckRequest : public TActorBootstrapped { TVector StoragePoolNames; THashMap, const NKikimrHive::TTabletInfo*> MergedTabletState; THashMap MergedNodeTabletState; + THashMap RestartsPerPeriod; ui64 StorageQuota; ui64 StorageUsage; }; @@ -1056,6 +1057,7 @@ class TSelfCheckRequest : public TActorBootstrapped { TString path(itFilterDomainKey->second); TDatabaseState& state(DatabaseState[path]); state.ComputeNodeIds.emplace_back(hiveStat.GetNodeId()); + state.RestartsPerPeriod[hiveStat.GetNodeId()] = hiveStat.GetRestartsPerPeriod(); } } } From 3cd00afee696b3c778c8c3a72a02c30d1c456eb2 Mon Sep 17 00:00:00 2001 From: Andrei Rykov Date: Wed, 17 Jan 2024 06:57:18 +0000 Subject: [PATCH 2/2] added issue --- ydb/core/health_check/health_check.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index 1f64437bbd2c..4f2fcc5258cf 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -1248,9 +1248,13 @@ class TSelfCheckRequest : public TActorBootstrapped { } } - void FillComputeNodeStatus(TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) { + void FillComputeNodeStatus(TDatabaseState& databaseState,TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) { FillNodeInfo(nodeId, context.Location.mutable_compute()->mutable_node()); + if (databaseState.RestartsPerPeriod[nodeId] > 30) { + context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Node is restarting too often", ETags::NodeState); + } + auto itNodeSystemState = MergedNodeSystemState.find(nodeId); if (itNodeSystemState != MergedNodeSystemState.end()) { const NKikimrWhiteboard::TSystemStateInfo& nodeSystemState(*itNodeSystemState->second); @@ -1308,8 +1312,9 @@ class TSelfCheckRequest : public TActorBootstrapped { } for (TNodeId nodeId : *computeNodeIds) { auto& computeNode = *computeStatus.add_nodes(); - FillComputeNodeStatus(nodeId, computeNode, {&context, "COMPUTE_NODE"}); + FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"}); } + context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::ComputeState, {ETags::NodeState}); context.ReportWithMaxChildStatus("Compute is overloaded", ETags::ComputeState, {ETags::OverloadState}); Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN; computeNodeIds->push_back(0); // for tablets without node