Skip to content

Commit 7beef2e

Browse files
authored
storage healthcheck fixes (#7212) (#7394)
1 parent 72e974f commit 7beef2e

File tree

2 files changed

+46
-15
lines changed

2 files changed

+46
-15
lines changed

ydb/core/health_check/health_check.cpp

+7 -7
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,7 @@ struct hash<NKikimrBlobStorage::TVSlotId> {
5656
}
5757

5858
#define BLOG_CRIT(stream) LOG_CRIT_S(*TlsActivationContext, NKikimrServices::HEALTH, stream)
59+
#define BLOG_D(stream) LOG_DEBUG_S(*TlsActivationContext, NKikimrServices::HEALTH, stream)
5960

6061
namespace NKikimr {
6162

@@ -643,7 +644,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
643644
}
644645

645646
bool NeedWhiteboardInfoForGroup(TGroupId groupId) {
646-
return !HaveAllBSControllerInfo() && IsStaticGroup(groupId);
647+
return UnknownStaticGroups.contains(groupId) || (!HaveAllBSControllerInfo() && IsStaticGroup(groupId));
647648
}
648649

649650
void Handle(TEvNodeWardenStorageConfig::TPtr ev) {
@@ -678,6 +679,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
678679

679680
auto groupId = vDisk.GetVDiskID().GetGroupID();
680681
if (NeedWhiteboardInfoForGroup(groupId)) {
682+
BLOG_D("Requesting whiteboard for group " << groupId);
681683
RequestStorageNode(vDisk.GetVDiskLocation().GetNodeID());
682684
}
683685
}
@@ -1308,6 +1310,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
13081310
// it should not be trusted
13091311
Ydb::Monitoring::StorageGroupStatus staticGroupStatus;
13101312
FillGroupStatus(0, staticGroupStatus, {nullptr});
1313+
BLOG_D("Static group status is " << staticGroupStatus.overall());
13111314
if (staticGroupStatus.overall() != Ydb::Monitoring::StatusFlag::GREEN) {
13121315
UnknownStaticGroups.emplace(0);
13131316
RequestStorageConfig();
@@ -1712,12 +1715,9 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
17121715
ETags::PDiskState);
17131716
}
17141717
switch (status->number()) {
1715-
case NKikimrBlobStorage::ACTIVE: {
1716-
context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
1717-
break;
1718-
}
1718+
case NKikimrBlobStorage::ACTIVE:
17191719
case NKikimrBlobStorage::INACTIVE: {
1720-
context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "PDisk is inactive", ETags::PDiskState);
1720+
context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
17211721
break;
17221722
}
17231723
case NKikimrBlobStorage::FAULTY:
@@ -2169,7 +2169,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
21692169
context.OverallStatus = MinStatus(context.OverallStatus, Ydb::Monitoring::StatusFlag::YELLOW);
21702170
checker.ReportStatus(context);
21712171

2172-
2172+
BLOG_D("Group " << groupId << " has status " << context.GetOverallStatus());
21732173
storageGroupStatus.set_overall(context.GetOverallStatus());
21742174
}
21752175

ydb/core/health_check/health_check_ut.cpp

+39 -8
Original file line number | Diff line number | Diff line change
@@ -163,12 +163,12 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
163163
}
164164

165165
void AddVSlotsToSysViewResponse(NSysView::TEvSysView::TEvGetVSlotsResponse::TPtr* ev, size_t groupCount,
166-
const TVector<NKikimrBlobStorage::EVDiskStatus>& vdiskStatuses) {
166+
const TVector<NKikimrBlobStorage::EVDiskStatus>& vdiskStatuses, ui32 groupStartId = GROUP_START_ID) {
167167
auto& record = (*ev)->Get()->Record;
168168
auto entrySample = record.entries(0);
169169
record.clear_entries();
170170

171-
auto groupId = GROUP_START_ID;
171+
auto groupId = groupStartId;
172172
const auto *descriptor = NKikimrBlobStorage::EVDiskStatus_descriptor();
173173
for (size_t i = 0; i < groupCount; ++i) {
174174
auto vslotId = VCARD_START_ID;
@@ -252,13 +252,13 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
252252
sPool->set_name(STORAGE_POOL_NAME);
253253
};
254254

255-
void AddVSlotInVDiskStateResponse(TEvWhiteboard::TEvVDiskStateResponse::TPtr* ev, int groupCount, int vslotCount) {
255+
void AddVSlotInVDiskStateResponse(TEvWhiteboard::TEvVDiskStateResponse::TPtr* ev, int groupCount, int vslotCount, ui32 groupStartId = GROUP_START_ID) {
256256
auto& pbRecord = (*ev)->Get()->Record;
257257

258258
auto sample = pbRecord.vdiskstateinfo(0);
259259
pbRecord.clear_vdiskstateinfo();
260260

261-
auto groupId = GROUP_START_ID;
261+
auto groupId = groupStartId;
262262
for (int i = 0; i < groupCount; i++) {
263263
auto slotId = VCARD_START_ID;
264264
for (int j = 0; j < vslotCount; j++) {
@@ -273,6 +273,12 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
273273
}
274274
}
275275

276+
void ChangeGroupStateResponse(NNodeWhiteboard::TEvWhiteboard::TEvBSGroupStateResponse::TPtr* ev) {
277+
for (auto& groupInfo : *(*ev)->Get()->Record.mutable_bsgroupstateinfo()) {
278+
groupInfo.set_erasurespecies(NHealthCheck::TSelfCheckRequest::BLOCK_4_2);
279+
}
280+
}
281+
276282
void SetLongHostValue(TEvInterconnect::TEvNodesInfo::TPtr* ev) {
277283
TString host(1000000, 'a');
278284
auto& pbRecord = (*ev)->Get()->Nodes;
@@ -383,7 +389,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
383389
CheckHcResult(result, groupNumber, vdiscPerGroupNumber, isMergeRecords);
384390
}
385391

386-
Ydb::Monitoring::SelfCheckResult RequestHcWithVdisks(const NKikimrBlobStorage::TGroupStatus::E groupStatus, const TVector<NKikimrBlobStorage::EVDiskStatus>& vdiskStatuses) {
392+
Ydb::Monitoring::SelfCheckResult RequestHcWithVdisks(const NKikimrBlobStorage::TGroupStatus::E groupStatus, const TVector<NKikimrBlobStorage::EVDiskStatus>& vdiskStatuses, bool forStaticGroup = false) {
387393
TPortManager tp;
388394
ui16 port = tp.GetPort(2134);
389395
ui16 grpcPort = tp.GetPort(2135);
@@ -418,7 +424,11 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
418424
}
419425
case NSysView::TEvSysView::EvGetVSlotsResponse: {
420426
auto* x = reinterpret_cast<NSysView::TEvSysView::TEvGetVSlotsResponse::TPtr*>(&ev);
421-
AddVSlotsToSysViewResponse(x, 1, vdiskStatuses);
427+
if (forStaticGroup) {
428+
AddVSlotsToSysViewResponse(x, 1, vdiskStatuses, 0);
429+
} else {
430+
AddVSlotsToSysViewResponse(x, 1, vdiskStatuses);
431+
}
422432
break;
423433
}
424434
case NSysView::TEvSysView::EvGetGroupsResponse: {
@@ -431,6 +441,19 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
431441
AddStoragePoolsToSysViewResponse(x);
432442
break;
433443
}
444+
case NNodeWhiteboard::TEvWhiteboard::EvVDiskStateResponse: {
445+
auto *x = reinterpret_cast<NNodeWhiteboard::TEvWhiteboard::TEvVDiskStateResponse::TPtr*>(&ev);
446+
if (forStaticGroup) {
447+
AddVSlotInVDiskStateResponse(x, 1, vdiskStatuses.size(), 0);
448+
} else {
449+
AddVSlotInVDiskStateResponse(x, 1, vdiskStatuses.size());
450+
}
451+
break;
452+
}
453+
case NNodeWhiteboard::TEvWhiteboard::EvBSGroupStateResponse: {
454+
auto* x = reinterpret_cast<NNodeWhiteboard::TEvWhiteboard::TEvBSGroupStateResponse::TPtr*>(&ev);
455+
ChangeGroupStateResponse(x);
456+
}
434457
}
435458

436459
return TTestActorRuntime::EEventAction::PROCESS;
@@ -444,10 +467,12 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
444467
return runtime.GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
445468
}
446469

447-
void CheckHcResultHasIssuesWithStatus(Ydb::Monitoring::SelfCheckResult& result, const TString& type, const Ydb::Monitoring::StatusFlag::Status expectingStatus, ui32 total) {
470+
void CheckHcResultHasIssuesWithStatus(Ydb::Monitoring::SelfCheckResult& result, const TString& type,
471+
const Ydb::Monitoring::StatusFlag::Status expectingStatus, ui32 total,
472+
std::string_view pool = "/Root:test") {
448473
int issuesCount = 0;
449474
for (const auto& issue_log : result.Getissue_log()) {
450-
if (issue_log.type() == type && issue_log.location().storage().pool().name() == "/Root:test" && issue_log.status() == expectingStatus) {
475+
if (issue_log.type() == type && issue_log.location().storage().pool().name() == pool && issue_log.status() == expectingStatus) {
451476
issuesCount++;
452477
}
453478
}
@@ -589,6 +614,12 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
589614
CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 1);
590615
}
591616

617+
Y_UNIT_TEST(StaticGroupIssue) {
618+
auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::PARTIAL, {NKikimrBlobStorage::ERROR}, /*forStatic*/ true);
619+
Cerr << result.ShortDebugString() << Endl;
620+
CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1, "static");
621+
}
622+
592623
/* HC currently infers group status on its own, so it's never unknown
593624
Y_UNIT_TEST(RedGroupIssueWhenUnknownGroupStatus) {
594625
auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::UNKNOWN, {});

0 commit comments

Comments
 (0)