From 1d225a1c6c16157282e1937cfb054ee4971c3460 Mon Sep 17 00:00:00 2001 From: Andrei Rykov Date: Wed, 17 Jan 2024 06:27:05 +0000 Subject: [PATCH 1/6] added RestartsPerPeriod info --- ydb/core/health_check/health_check.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index e4bce7b539d8..1f64437bbd2c 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -230,6 +230,7 @@ class TSelfCheckRequest : public TActorBootstrapped { TVector StoragePoolNames; THashMap, const NKikimrHive::TTabletInfo*> MergedTabletState; THashMap MergedNodeTabletState; + THashMap RestartsPerPeriod; ui64 StorageQuota; ui64 StorageUsage; }; @@ -1056,6 +1057,7 @@ class TSelfCheckRequest : public TActorBootstrapped { TString path(itFilterDomainKey->second); TDatabaseState& state(DatabaseState[path]); state.ComputeNodeIds.emplace_back(hiveStat.GetNodeId()); + state.RestartsPerPeriod[hiveStat.GetNodeId()] = hiveStat.GetRestartsPerPeriod(); } } } From 3cd00afee696b3c778c8c3a72a02c30d1c456eb2 Mon Sep 17 00:00:00 2001 From: Andrei Rykov Date: Wed, 17 Jan 2024 06:57:18 +0000 Subject: [PATCH 2/6] added issue --- ydb/core/health_check/health_check.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index 1f64437bbd2c..4f2fcc5258cf 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -1248,9 +1248,13 @@ class TSelfCheckRequest : public TActorBootstrapped { } } - void FillComputeNodeStatus(TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) { + void FillComputeNodeStatus(TDatabaseState& databaseState,TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) { FillNodeInfo(nodeId, context.Location.mutable_compute()->mutable_node()); + if (databaseState.RestartsPerPeriod[nodeId] > 30) { + context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Node is restarting too often", ETags::NodeState); + } + auto itNodeSystemState = MergedNodeSystemState.find(nodeId); if (itNodeSystemState != MergedNodeSystemState.end()) { const NKikimrWhiteboard::TSystemStateInfo& nodeSystemState(*itNodeSystemState->second); @@ -1308,8 +1312,9 @@ class TSelfCheckRequest : public TActorBootstrapped { } for (TNodeId nodeId : *computeNodeIds) { auto& computeNode = *computeStatus.add_nodes(); - FillComputeNodeStatus(nodeId, computeNode, {&context, "COMPUTE_NODE"}); + FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"}); } + context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::ComputeState, {ETags::NodeState}); context.ReportWithMaxChildStatus("Compute is overloaded", ETags::ComputeState, {ETags::OverloadState}); Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN; computeNodeIds->push_back(0); // for tablets without node From 28f77b4672b685bc95ac6deb4af01f2dd0447aaa Mon Sep 17 00:00:00 2001 From: Andrei Rykov Date: Thu, 18 Jan 2024 01:05:46 +0000 Subject: [PATCH 3/6] node-check --- ydb/core/health_check/health_check.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index 4f2fcc5258cf..c643831cb33f 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -141,6 +141,7 @@ class TSelfCheckRequest : public TActorBootstrapped { SystemTabletState, OverloadState, SyncState, + Uptime, }; struct TTenantInfo { @@ -230,7 +231,7 @@ class TSelfCheckRequest : public TActorBootstrapped { TVector StoragePoolNames; THashMap, const NKikimrHive::TTabletInfo*> MergedTabletState; THashMap MergedNodeTabletState; - THashMap RestartsPerPeriod; + THashMap NodeRestartsPerPeriod; ui64 StorageQuota; ui64 StorageUsage; }; @@ -1057,7 +1058,7 @@ class TSelfCheckRequest : public TActorBootstrapped { TString path(itFilterDomainKey->second); TDatabaseState& state(DatabaseState[path]); state.ComputeNodeIds.emplace_back(hiveStat.GetNodeId()); - state.RestartsPerPeriod[hiveStat.GetNodeId()] = hiveStat.GetRestartsPerPeriod(); + state.NodeRestartsPerPeriod[hiveStat.GetNodeId()] = hiveStat.GetRestartsPerPeriod(); } } } @@ -1251,8 +1252,13 @@ class TSelfCheckRequest : public TActorBootstrapped { void FillComputeNodeStatus(TDatabaseState& databaseState,TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) { FillNodeInfo(nodeId, context.Location.mutable_compute()->mutable_node()); - if (databaseState.RestartsPerPeriod[nodeId] > 30) { - context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Node is restarting too often", ETags::NodeState); + TSelfCheckContext rrContext(&context, "UPTIME"); + if (databaseState.NodeRestartsPerPeriod[nodeId] >= 30) { + rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Node is restarting too often", ETags::Uptime); + } else if (databaseState.NodeRestartsPerPeriod[nodeId] >= 10) { + rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Node is restarting too often", ETags::Uptime); + } else { + rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN); } auto itNodeSystemState = MergedNodeSystemState.find(nodeId); From f41531fd0736c7e13a6411aefb0181c2b28b5335 Mon Sep 17 00:00:00 2001 From: Andrei Rykov Date: Thu, 18 Jan 2024 01:35:06 +0000 Subject: [PATCH 4/6] changed message to NODE UPTIME --- ydb/core/health_check/health_check.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index c643831cb33f..524cb19cba10 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -1252,7 +1252,7 @@ class TSelfCheckRequest : public TActorBootstrapped { void FillComputeNodeStatus(TDatabaseState& databaseState,TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) { FillNodeInfo(nodeId, context.Location.mutable_compute()->mutable_node()); - TSelfCheckContext rrContext(&context, "UPTIME"); + TSelfCheckContext rrContext(&context, "NODE UPTIME"); if (databaseState.NodeRestartsPerPeriod[nodeId] >= 30) { rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Node is restarting too often", ETags::Uptime); } else if (databaseState.NodeRestartsPerPeriod[nodeId] >= 10) { @@ -1320,7 +1320,7 @@ class TSelfCheckRequest : public TActorBootstrapped { auto& computeNode = *computeStatus.add_nodes(); FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"}); } - context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::ComputeState, {ETags::NodeState}); + context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::ComputeState, {ETags::Uptime}); context.ReportWithMaxChildStatus("Compute is overloaded", ETags::ComputeState, {ETags::OverloadState}); Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN; computeNodeIds->push_back(0); // for tablets without node From 7033417d4c98a18171e80b872a254dcd7d7af67a Mon Sep 17 00:00:00 2001 From: Andrei Rykov Date: Thu, 18 Jan 2024 11:43:15 +0000 Subject: [PATCH 5/6] changed messages --- ydb/core/health_check/health_check.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index 524cb19cba10..397852b0ccdb 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -1252,11 +1252,11 @@ class TSelfCheckRequest : public TActorBootstrapped { void FillComputeNodeStatus(TDatabaseState& databaseState,TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) { FillNodeInfo(nodeId, context.Location.mutable_compute()->mutable_node()); - TSelfCheckContext rrContext(&context, "NODE UPTIME"); + TSelfCheckContext rrContext(&context, "NODE_UPTIME"); if (databaseState.NodeRestartsPerPeriod[nodeId] >= 30) { rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Node is restarting too often", ETags::Uptime); } else if (databaseState.NodeRestartsPerPeriod[nodeId] >= 10) { - rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Node is restarting too often", ETags::Uptime); + rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Node restart frequency is elevated", ETags::Uptime); } else { rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN); } From 28b3cccc5092919e2713148fd2be032914e4e3ca Mon Sep 17 00:00:00 2001 From: Andrei Rykov Date: Thu, 18 Jan 2024 15:23:22 +0000 Subject: [PATCH 6/6] changed message --- ydb/core/health_check/health_check.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index 397852b0ccdb..b34f3daea057 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -1256,7 +1256,7 @@ class TSelfCheckRequest : public TActorBootstrapped { if (databaseState.NodeRestartsPerPeriod[nodeId] >= 30) { rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Node is restarting too often", ETags::Uptime); } else if (databaseState.NodeRestartsPerPeriod[nodeId] >= 10) { - rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Node restart frequency is elevated", ETags::Uptime); + rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "The number of node restarts has increased", ETags::Uptime); } else { rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN); }