diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index e4bce7b539d8..b34f3daea057 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -141,6 +141,7 @@ class TSelfCheckRequest : public TActorBootstrapped { SystemTabletState, OverloadState, SyncState, + Uptime, }; struct TTenantInfo { @@ -230,6 +231,7 @@ class TSelfCheckRequest : public TActorBootstrapped { TVector StoragePoolNames; THashMap, const NKikimrHive::TTabletInfo*> MergedTabletState; THashMap MergedNodeTabletState; + THashMap NodeRestartsPerPeriod; ui64 StorageQuota; ui64 StorageUsage; }; @@ -1056,6 +1058,7 @@ class TSelfCheckRequest : public TActorBootstrapped { TString path(itFilterDomainKey->second); TDatabaseState& state(DatabaseState[path]); state.ComputeNodeIds.emplace_back(hiveStat.GetNodeId()); + state.NodeRestartsPerPeriod[hiveStat.GetNodeId()] = hiveStat.GetRestartsPerPeriod(); } } } @@ -1246,9 +1249,18 @@ class TSelfCheckRequest : public TActorBootstrapped { } } - void FillComputeNodeStatus(TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) { + void FillComputeNodeStatus(TDatabaseState& databaseState,TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) { FillNodeInfo(nodeId, context.Location.mutable_compute()->mutable_node()); + TSelfCheckContext rrContext(&context, "NODE_UPTIME"); + if (databaseState.NodeRestartsPerPeriod[nodeId] >= 30) { + rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Node is restarting too often", ETags::Uptime); + } else if (databaseState.NodeRestartsPerPeriod[nodeId] >= 10) { + rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "The number of node restarts has increased", ETags::Uptime); + } else { + rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN); + } + auto itNodeSystemState = MergedNodeSystemState.find(nodeId); if (itNodeSystemState != MergedNodeSystemState.end()) { const NKikimrWhiteboard::TSystemStateInfo& nodeSystemState(*itNodeSystemState->second); @@ -1306,8 +1318,9 @@ class TSelfCheckRequest : public TActorBootstrapped { } for (TNodeId nodeId : *computeNodeIds) { auto& computeNode = *computeStatus.add_nodes(); - FillComputeNodeStatus(nodeId, computeNode, {&context, "COMPUTE_NODE"}); + FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"}); } + context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::ComputeState, {ETags::Uptime}); context.ReportWithMaxChildStatus("Compute is overloaded", ETags::ComputeState, {ETags::OverloadState}); Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN; computeNodeIds->push_back(0); // for tablets without node