@@ -1248,9 +1248,13 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
1248
1248
}
1249
1249
}
1250
1250
1251
- void FillComputeNodeStatus (TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
1251
+ void FillComputeNodeStatus (TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
1252
1252
FillNodeInfo (nodeId, context.Location .mutable_compute ()->mutable_node ());
1253
1253
1254
+ if (databaseState.RestartsPerPeriod [nodeId] > 30 ) {
1255
+ context.ReportStatus (Ydb::Monitoring::StatusFlag::RED, " Node is restarting too often" , ETags::NodeState);
1256
+ }
1257
+
1254
1258
auto itNodeSystemState = MergedNodeSystemState.find (nodeId);
1255
1259
if (itNodeSystemState != MergedNodeSystemState.end ()) {
1256
1260
const NKikimrWhiteboard::TSystemStateInfo& nodeSystemState (*itNodeSystemState->second );
@@ -1308,8 +1312,9 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
1308
1312
}
1309
1313
for (TNodeId nodeId : *computeNodeIds) {
1310
1314
auto & computeNode = *computeStatus.add_nodes ();
1311
- FillComputeNodeStatus (nodeId, computeNode, {&context, " COMPUTE_NODE" });
1315
+ FillComputeNodeStatus (databaseState, nodeId, computeNode, {&context, " COMPUTE_NODE" });
1312
1316
}
1317
+ context.ReportWithMaxChildStatus (" Some nodes are restarting too often" , ETags::ComputeState, {ETags::NodeState});
1313
1318
context.ReportWithMaxChildStatus (" Compute is overloaded" , ETags::ComputeState, {ETags::OverloadState});
1314
1319
Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN;
1315
1320
computeNodeIds->push_back (0 ); // for tablets without node
0 commit comments