Skip to content

Commit eaae31c

Browse files
committed
Fix segfault in Group Layout Sanitizer, add stress tests with INACTIVE/FAULTY disks (ydb-platform#15516)
1 parent 9480de8 commit eaae31c

File tree

2 files changed

+133
-9
lines changed

2 files changed

+133
-9
lines changed

ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp

Lines changed: 122 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,11 @@
11
#include <ydb/core/blobstorage/ut_blobstorage/lib/env.h>
2+
#include <ydb/core/blobstorage/ut_blobstorage/lib/common.h>
23
#include <ydb/core/mind/bscontroller/layout_helpers.h>
34

45
Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) {
6+
using NBsController::TPDiskId;
7+
using NKikimrBlobStorage::EDriveStatus;
8+
59
bool CatchSanitizeRequests(ui32 /*nodeId*/, std::unique_ptr<IEventHandle>& ev) {
610
if (ev->GetTypeRewrite() == TEvBlobStorage::TEvControllerConfigRequest::EventType) {
711
const auto& request = ev->Get<TEvBlobStorage::TEvControllerConfigRequest>()->Record.GetRequest();
@@ -216,4 +220,122 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) {
216220
Y_UNIT_TEST(ForbidMultipleRealmsOccupation) {
    // Runs the shared TestMultipleRealmsOccupation scenario with its flag off.
    // NOTE(review): judging by the test name the flag presumably controls whether
    // multiple-realm occupation is allowed -- confirm against the helper's definition.
    const bool flag = false;
    TestMultipleRealmsOccupation(flag);
}
223+
224+
// Randomised stress scenario for the group layout sanitizer.
//
// Builds node locations via MakeLocations(locations, dcs, racks, units, ...),
// creates an environment for `groupType`, and then performs `steps` random
// actions: reshuffling the location list around an environment restart,
// pushing random drive statuses (INACTIVE / BROKEN / FAULTY / ACTIVE) to random
// PDisks, and toggling the self-heal and group-layout-sanitizer settings.
//
// The function contains no assertions of its own; it only drives the
// environment through the random sequence.
//
// Parameters:
//   groupType - erasure type forwarded to CreateEnv.
//   dcs, racks, units - topology sizes forwarded to MakeLocations.
void StressTest(TBlobStorageGroupType groupType, ui32 dcs, ui32 racks, ui32 units) {
    const ui32 steps = 100;
    std::vector<TNodeLocation> locations;

    MakeLocations(locations, dcs, racks, units, LocationGenerator);
    std::unique_ptr<TEnvironmentSetup> env;

    CreateEnv(env, locations, groupType);
    env->Sim(TDuration::Minutes(3));
    env->UpdateSettings(false, false, false);

    // Snapshot every PDisk known to the base config; drive-status updates pick from this list.
    std::vector<TPDiskId> pdisks;

    {
        auto cfg = env->FetchBaseConfig();
        for (const auto& pdisk : cfg.GetPDisk()) {
            pdisks.emplace_back(pdisk.GetNodeId(), pdisk.GetPDiskId());
        }
    }

    // Tears the environment down, permutes the location list and brings it back up.
    // NOTE(review): presumably the environment re-reads `locations` on Initialize() -- confirm in CreateEnv.
    auto shuffleLocations = [&]() {
        env->Cleanup();
        // Fisher-Yates shuffle driven by RandomNumber, the same random source the rest
        // of this test uses; std::random_shuffle was removed from the standard in C++17.
        for (ui32 remaining = static_cast<ui32>(locations.size()); remaining > 1; --remaining) {
            std::swap(locations[remaining - 1], locations[RandomNumber<ui32>(remaining)]);
        }
        env->Initialize();
        env->Sim(TDuration::Seconds(100));
    };

    // Sends one config request carrying `drives` UpdateDriveStatus commands, each
    // targeting a randomly chosen PDisk with a randomly chosen status.
    auto updateDriveStatus = [&](ui32 drives) {
        NKikimrBlobStorage::TConfigRequest request;
        // All consistency checks are switched off so that arbitrary status combinations are accepted.
        request.SetIgnoreGroupFailModelChecks(true);
        request.SetIgnoreGroupSanityChecks(true);
        request.SetIgnoreDegradedGroupsChecks(true);
        request.SetIgnoreDisintegratedGroupsChecks(true);
        for (ui32 i = 0; i < drives; ++i) {
            auto* cmd = request.AddCommand();
            auto* drive = cmd->MutableUpdateDriveStatus();
            const TPDiskId pdiskId = pdisks[RandomNumber<ui32>(pdisks.size())];
            drive->MutableHostKey()->SetNodeId(pdiskId.NodeId);
            drive->SetPDiskId(pdiskId.PDiskId);
            // Seven outcomes: one each for INACTIVE, BROKEN and FAULTY, the remaining four map to ACTIVE.
            switch (RandomNumber<ui32>(7)) {
                case 0:
                    drive->SetStatus(EDriveStatus::INACTIVE);
                    break;
                case 1:
                    drive->SetStatus(EDriveStatus::BROKEN);
                    break;
                case 2:
                    drive->SetStatus(EDriveStatus::FAULTY);
                    break;
                default:
                    drive->SetStatus(EDriveStatus::ACTIVE);
                    break;
            }
        }

        env->Invoke(request);
    };

    enum class EActions {
        SHUFFLE_LOCATIONS = 0,
        UPDATE_STATUS,
        ENABLE_SANITIZER,
        DISABLE_SANITIZER,
        ENABLE_SELF_HEAL,
        DISABLE_SELF_HEAL,
    };
    TWeightedRandom<EActions> act;

    // Drive-status updates are weighted five times higher than any other action.
    act.AddValue(EActions::SHUFFLE_LOCATIONS, 1);
    act.AddValue(EActions::UPDATE_STATUS, 5);
    act.AddValue(EActions::ENABLE_SANITIZER, 1);
    act.AddValue(EActions::DISABLE_SANITIZER, 1);
    act.AddValue(EActions::ENABLE_SELF_HEAL, 1);
    act.AddValue(EActions::DISABLE_SELF_HEAL, 1);

    bool selfHeal = false;
    bool groupLayoutSanitizer = false;

    // Pushes the current flag values to the environment; the middle argument is always false.
    auto applySettings = [&]() {
        env->UpdateSettings(selfHeal, false, groupLayoutSanitizer);
    };

    for (ui32 i = 0; i < steps; ++i) {
        switch (act.GetRandom()) {
            case EActions::SHUFFLE_LOCATIONS:
                shuffleLocations();
                break;
            case EActions::UPDATE_STATUS:
                // Between 1 and 5 drives per request.
                updateDriveStatus(RandomNumber<ui32>(5) + 1);
                break;
            case EActions::ENABLE_SANITIZER:
                groupLayoutSanitizer = true;
                applySettings();
                break;
            case EActions::DISABLE_SANITIZER:
                groupLayoutSanitizer = false;
                applySettings();
                break;
            case EActions::ENABLE_SELF_HEAL:
                selfHeal = true;
                applySettings();
                break;
            case EActions::DISABLE_SELF_HEAL:
                selfHeal = false;
                applySettings();
                break;
        }
    }
}
329+
330+
Y_UNIT_TEST(StressMirror3dc) {
    // Stress run for mirror-3-dc groups; the three sizes below are passed to
    // StressTest as its dcs, racks and units arguments.
    const ui32 dcs = 3;
    const ui32 racks = 5;
    const ui32 units = 1;
    StressTest(TBlobStorageGroupType::ErasureMirror3dc, dcs, racks, units);
}
333+
334+
Y_UNIT_TEST(StressBlock4Plus2) {
    // Stress run for block-4-2 groups; the three sizes below are passed to
    // StressTest as its dcs, racks and units arguments.
    const ui32 dcs = 1;
    const ui32 racks = 10;
    const ui32 units = 2;
    StressTest(TBlobStorageGroupType::Erasure4Plus2Block, dcs, racks, units);
}
337+
338+
Y_UNIT_TEST(StressMirror3of4) {
    // Stress run for mirror-3of4 groups; the three sizes below are passed to
    // StressTest as its dcs, racks and units arguments.
    const ui32 dcs = 1;
    const ui32 racks = 10;
    const ui32 units = 2;
    StressTest(TBlobStorageGroupType::ErasureMirror3of4, dcs, racks, units);
}
219341
}

ydb/core/mind/bscontroller/group_mapper.cpp

Lines changed: 11 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -790,15 +790,17 @@ namespace NKikimr::NBsController {
790790
}
791791

792792
for (ui32 orderNum = 0; orderNum < group.size(); ++orderNum) {
793-
const TVDiskIdShort vdisk = Topology.GetVDiskId(orderNum);
794-
ui32 pRealm = group[orderNum]->Position.Realm.Index();
795-
ui32 desiredPRealm = RealmNavigator[vdisk.FailRealm];
796-
if (pRealm != desiredPRealm) {
797-
if (realmOccupation[pRealm].size() > 1) {
798-
// disks from different fail realms in one Realm present
799-
failDetected(EFailLevel::REALM_FAIL, orderNum);
800-
} else {
801-
failDetected(EFailLevel::MULTIPLE_REALM_OCCUPATION, orderNum);
793+
if (group[orderNum]) {
794+
const TVDiskIdShort vdisk = Topology.GetVDiskId(orderNum);
795+
ui32 pRealm = group[orderNum]->Position.Realm.Index();
796+
ui32 desiredPRealm = RealmNavigator[vdisk.FailRealm];
797+
if (pRealm != desiredPRealm) {
798+
if (realmOccupation[pRealm].size() > 1) {
799+
// disks from different fail realms in one Realm present
800+
failDetected(EFailLevel::REALM_FAIL, orderNum);
801+
} else {
802+
failDetected(EFailLevel::MULTIPLE_REALM_OCCUPATION, orderNum);
803+
}
802804
}
803805
}
804806
}

0 commit comments

Comments
 (0)