Skip to content

Commit acc9844

Browse files
authored
storage health check fixes (#8291)
1 parent 3de3ff5 commit acc9844

File tree

2 files changed

+100
-58
lines changed

2 files changed

+100
-58
lines changed

ydb/core/health_check/health_check.cpp

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
251251
struct TGroupState {
252252
TString ErasureSpecies;
253253
std::vector<const NKikimrSysView::TVSlotEntry*> VSlots;
254+
ui32 Generation;
254255
};
255256

256257
struct TSelfCheckResult {
@@ -1276,12 +1277,17 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
12761277
for (const auto& group : Groups->GetEntries()) {
12771278
auto groupId = group.GetKey().GetGroupId();
12781279
auto poolId = group.GetInfo().GetStoragePoolId();
1279-
GroupState[groupId].ErasureSpecies = group.GetInfo().GetErasureSpeciesV2();
1280+
auto& groupState = GroupState[groupId];
1281+
groupState.ErasureSpecies = group.GetInfo().GetErasureSpeciesV2();
1282+
groupState.Generation = group.GetInfo().GetGeneration();
12801283
StoragePoolState[poolId].Groups.emplace(groupId);
12811284
}
12821285
for (const auto& vSlot : VSlots->GetEntries()) {
12831286
auto vSlotId = GetVSlotId(vSlot.GetKey());
1284-
GroupState[vSlot.GetInfo().GetGroupId()].VSlots.push_back(&vSlot);
1287+
auto groupStateIt = GroupState.find(vSlot.GetInfo().GetGroupId());
1288+
if (groupStateIt != GroupState.end() && vSlot.GetInfo().GetGroupGeneration() == groupStateIt->second.Generation) {
1289+
groupStateIt->second.VSlots.push_back(&vSlot);
1290+
}
12851291
}
12861292
for (const auto& pool : StoragePools->GetEntries()) { // there is no specific pool for static group here
12871293
ui64 poolId = pool.GetKey().GetStoragePoolId();
@@ -1782,6 +1788,13 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
17821788

17831789
storageVDiskStatus.set_id(GetVSlotId(vSlot->GetKey()));
17841790

1791+
if (!vSlot->GetInfo().HasStatusV2()) {
1792+
// this should mean that BSC recently restarted and does not have accurate data yet - we should not report to avoid false positives
1793+
context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
1794+
storageVDiskStatus.set_overall(context.GetOverallStatus());
1795+
return;
1796+
}
1797+
17851798
const auto *descriptor = NKikimrBlobStorage::EVDiskStatus_descriptor();
17861799
auto status = descriptor->FindValueByName(vSlot->GetInfo().GetStatusV2());
17871800
if (!status) { // this case is not expected because BSC assigns status according to the EVDiskStatus enum
@@ -1801,16 +1814,12 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
18011814
storageVDiskStatus.set_overall(context.GetOverallStatus());
18021815
return;
18031816
}
1804-
case NKikimrBlobStorage::INIT_PENDING: { // initialization in process
1805-
context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, TStringBuilder() << "VDisk is being initialized", ETags::VDiskState);
1806-
storageVDiskStatus.set_overall(context.GetOverallStatus());
1807-
return;
1808-
}
18091817
case NKikimrBlobStorage::REPLICATING: { // the disk accepts queries, but not all the data was replicated
18101818
context.ReportStatus(Ydb::Monitoring::StatusFlag::BLUE, TStringBuilder() << "Replication in progress", ETags::VDiskState);
18111819
storageVDiskStatus.set_overall(context.GetOverallStatus());
18121820
return;
18131821
}
1822+
case NKikimrBlobStorage::INIT_PENDING:
18141823
case NKikimrBlobStorage::READY: { // the disk is fully operational and does not affect group fault tolerance
18151824
context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
18161825
}

0 commit comments

Comments
 (0)