Skip to content

Commit bf4f55b

Browse files
authored
GroupLayoutSanitizer always monitors invalid groups, add UTs (#15026)
1 parent 62279d7 commit bf4f55b

File tree

6 files changed

+57
-77
lines changed

6 files changed

+57
-77
lines changed

ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp

Lines changed: 44 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) {
2525
}
2626
}
2727

28-
void CreateEnv(std::unique_ptr<TEnvironmentSetup>& env, std::vector<TNodeLocation>& locations) {
29-
TBlobStorageGroupType groupType = TBlobStorageGroupType::ErasureMirror3dc;
28+
void CreateEnv(std::unique_ptr<TEnvironmentSetup>& env, std::vector<TNodeLocation>& locations,
29+
TBlobStorageGroupType groupType) {
3030
const ui32 numNodes = locations.size();
3131

3232
env.reset(new TEnvironmentSetup({
@@ -37,39 +37,49 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) {
3737

3838
const ui32 disksPerNode = 1;
3939
const ui32 slotsPerDisk = 3;
40+
41+
env->Runtime->FilterFunction = CatchSanitizeRequests;
4042
env->CreateBoxAndPool(disksPerNode, numNodes * disksPerNode * slotsPerDisk / 9);
43+
env->Runtime->FilterFunction = {};
4144
}
4245

43-
Y_UNIT_TEST(Test3dc) {
46+
NActorsInterconnect::TNodeLocation LocationGenerator(ui32 dc, ui32 rack, ui32 unit) {
47+
NActorsInterconnect::TNodeLocation proto;
48+
proto.SetDataCenter(ToString(dc));
49+
proto.SetRack(ToString(rack));
50+
proto.SetUnit(ToString(unit));
51+
return proto;
52+
}
53+
54+
void Test(TBlobStorageGroupType groupType, ui32 dcs, ui32 racks, ui32 units) {
4455
std::vector<TNodeLocation> locations;
45-
TLocationGenerator locationGenerator = [](ui32 dc, ui32 rack, ui32 unit) {
46-
NActorsInterconnect::TNodeLocation proto;
47-
proto.SetDataCenter(ToString(dc));
48-
proto.SetRack(ToString(rack));
49-
proto.SetUnit(ToString(unit));
50-
return proto;
51-
};
5256

53-
MakeLocations(locations, 3, 5, 1, locationGenerator);
57+
MakeLocations(locations, dcs, racks, units, LocationGenerator);
5458
std::unique_ptr<TEnvironmentSetup> env;
55-
CreateEnv(env, locations);
5659

57-
TBlobStorageGroupType groupType = TBlobStorageGroupType::ErasureMirror3dc;
58-
TGroupGeometryInfo geom = CreateGroupGeometry(groupType);
60+
CreateEnv(env, locations, groupType);
5961

62+
63+
// Ensure that the sanitizer doesn't send requests to initially allocated groups
6064
env->Runtime->FilterFunction = CatchSanitizeRequests;
65+
env->UpdateSettings(true, false, true);
66+
env->Sim(TDuration::Minutes(3));
67+
env->UpdateSettings(false, false, false);
68+
69+
TGroupGeometryInfo geom = CreateGroupGeometry(groupType);
6170

6271
TString error;
6372
auto cfg = env->FetchBaseConfig();
6473
UNIT_ASSERT_C(CheckBaseConfigLayout(geom, cfg, true, error), error);
65-
env->Cleanup();
6674

6775
// Shuffle node locations, ensure that a layout error occurred
68-
std::random_shuffle(locations.begin(), locations.end());
69-
env->Initialize();
70-
env->Sim(TDuration::Seconds(100));
71-
cfg = env->FetchBaseConfig();
72-
CheckBaseConfigLayout(geom, cfg, true, error);
76+
do {
77+
env->Cleanup();
78+
std::random_shuffle(locations.begin(), locations.end());
79+
env->Initialize();
80+
env->Sim(TDuration::Seconds(100));
81+
cfg = env->FetchBaseConfig();
82+
} while (CheckBaseConfigLayout(geom, cfg, true, error));
7383
Cerr << error << Endl;
7484

7585
// Sanitize groups
@@ -86,6 +96,18 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) {
8696
UNIT_ASSERT_C(CheckBaseConfigLayout(geom, cfg, true, error), error);
8797
}
8898

99+
Y_UNIT_TEST(Test3dc) {
100+
Test(TBlobStorageGroupType::ErasureMirror3dc, 3, 5, 1);
101+
}
102+
103+
Y_UNIT_TEST(TestBlock4Plus2) {
104+
Test(TBlobStorageGroupType::Erasure4Plus2Block, 1, 10, 2);
105+
}
106+
107+
Y_UNIT_TEST(TestMirror3of4) {
108+
Test(TBlobStorageGroupType::ErasureMirror3of4, 1, 10, 2);
109+
}
110+
89111
TString PrintGroups(TBlobStorageGroupType groupType, const NKikimrBlobStorage::TBaseConfig& cfg,
90112
std::vector<TNodeLocation> locations) {
91113
TGroupGeometryInfo geom = CreateGroupGeometry(groupType);
@@ -137,6 +159,7 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) {
137159
}
138160

139161
void TestMultipleRealmsOccupation(bool allowMultipleRealmsOccupation) {
162+
TBlobStorageGroupType groupType = TBlobStorageGroupType::ErasureMirror3dc;
140163
std::vector<TNodeLocation> locations;
141164
TLocationGenerator locationGenerator = [](ui32 dc, ui32 rack, ui32 unit) {
142165
NActorsInterconnect::TNodeLocation proto;
@@ -152,9 +175,8 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) {
152175
};
153176
MakeLocations(locations, 4, 5, 1, locationGenerator);
154177
std::unique_ptr<TEnvironmentSetup> env;
155-
CreateEnv(env, locations);
178+
CreateEnv(env, locations, groupType);
156179

157-
TBlobStorageGroupType groupType = TBlobStorageGroupType::ErasureMirror3dc;
158180
TGroupGeometryInfo geom = CreateGroupGeometry(groupType);
159181

160182
env->Runtime->FilterFunction = CatchSanitizeRequests;
Lines changed: 0 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,47 +1,3 @@
11
#include "group_layout_checker.h"
2-
#include "group_geometry_info.h"
3-
4-
namespace NKikimr::NBsController {
5-
6-
TLayoutCheckResult CheckGroupLayout(const TGroupGeometryInfo& geom, const THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>>& layout) {
7-
using namespace NLayoutChecker;
8-
9-
if (layout.empty()) {
10-
return {};
11-
}
12-
13-
TBlobStorageGroupInfo::TTopology topology(geom.GetType(), geom.GetNumFailRealms(), geom.GetNumFailDomainsPerFailRealm(),
14-
geom.GetNumVDisksPerFailDomain(), true);
15-
TGroupLayout group(topology);
16-
TDomainMapper mapper;
17-
THashMap<TVDiskIdShort, TPDiskLayoutPosition> map;
18-
for (const auto& [vdiskId, p] : layout) {
19-
const auto& [location, pdiskId] = p;
20-
TPDiskLayoutPosition pos(mapper, location, pdiskId, geom);
21-
group.AddDisk(pos, topology.GetOrderNumber(vdiskId));
22-
map.emplace(vdiskId, pos);
23-
}
24-
25-
std::vector<std::pair<TScore, TVDiskIdShort>> scoreboard;
26-
for (const auto& [vdiskId, pos] : map) {
27-
scoreboard.emplace_back(group.GetCandidateScore(pos, topology.GetOrderNumber(vdiskId)), vdiskId);
28-
}
29-
30-
auto comp1 = [](const auto& x, const auto& y) { return x.second < y.second; };
31-
std::sort(scoreboard.begin(), scoreboard.end(), comp1);
32-
33-
auto comp = [](const auto& x, const auto& y) { return x.first.BetterThan(y.first); };
34-
std::sort(scoreboard.begin(), scoreboard.end(), comp);
35-
TLayoutCheckResult res;
36-
const auto reference = scoreboard.back().first;
37-
if (!reference.SameAs({})) { // not perfectly correct layout
38-
for (; !scoreboard.empty() && !scoreboard.back().first.BetterThan(reference); scoreboard.pop_back()) {
39-
res.Candidates.push_back(scoreboard.back().second);
40-
}
41-
}
42-
return res;
43-
}
44-
45-
} // NKikimr::NBsController
462

473
Y_DECLARE_OUT_SPEC(, NKikimr::NBsController::NLayoutChecker::TEntityId, stream, value) { value.Output(stream); }

ydb/core/mind/bscontroller/group_layout_checker.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,4 @@ namespace NKikimr::NBsController {
289289
}
290290
};
291291

292-
TLayoutCheckResult CheckGroupLayout(const TGroupGeometryInfo& geom, const THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>>& layout);
293-
294292
} // NKikimr::NBsController

ydb/core/mind/bscontroller/impl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1520,6 +1520,7 @@ class TBlobStorageController : public TActor<TBlobStorageController>, public TTa
15201520
private:
15211521
TString InstanceId;
15221522
std::shared_ptr<std::atomic_uint64_t> SelfHealUnreassignableGroups = std::make_shared<std::atomic_uint64_t>();
1523+
std::shared_ptr<std::atomic_uint64_t> GroupLayoutSanitizerInvalidGroups = std::make_shared<std::atomic_uint64_t>();
15231524
TMaybe<TActorId> MigrationId;
15241525
TVSlots VSlots; // ordering is important
15251526
TPDisks PDisks; // ordering is important

ydb/core/mind/bscontroller/self_heal.cpp

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,7 @@ namespace NKikimr::NBsController {
284284
bool DonorMode;
285285
THostRecordMap HostRecords;
286286
std::shared_ptr<TControlWrapper> EnableSelfHealWithDegraded;
287+
std::shared_ptr<std::atomic_uint64_t> GroupsWithInvalidLayoutCounter;
287288

288289
using TTopologyDescr = std::tuple<TBlobStorageGroupType::EErasureSpecies, ui32, ui32, ui32>;
289290
THashMap<TTopologyDescr, std::shared_ptr<TBlobStorageGroupInfo::TTopology>> Topologies;
@@ -296,14 +297,16 @@ namespace NKikimr::NBsController {
296297
public:
297298
TSelfHealActor(ui64 tabletId, std::shared_ptr<std::atomic_uint64_t> unreassignableGroups, THostRecordMap hostRecords,
298299
bool groupLayoutSanitizerEnabled, bool allowMultipleRealmsOccupation, bool donorMode,
299-
std::shared_ptr<TControlWrapper> enableSelfHealWithDegraded)
300+
std::shared_ptr<TControlWrapper> enableSelfHealWithDegraded,
301+
std::shared_ptr<std::atomic_uint64_t> groupsWithInvalidLayoutCounter)
300302
: TabletId(tabletId)
301303
, UnreassignableGroups(std::move(unreassignableGroups))
302304
, GroupLayoutSanitizerEnabled(groupLayoutSanitizerEnabled)
303305
, AllowMultipleRealmsOccupation(allowMultipleRealmsOccupation)
304306
, DonorMode(donorMode)
305307
, HostRecords(std::move(hostRecords))
306308
, EnableSelfHealWithDegraded(std::move(enableSelfHealWithDegraded))
309+
, GroupsWithInvalidLayoutCounter(std::move(groupsWithInvalidLayoutCounter))
307310
{}
308311

309312
void Bootstrap(const TActorId& parentId) {
@@ -318,17 +321,16 @@ namespace NKikimr::NBsController {
318321

319322
void Handle(TEvControllerUpdateSelfHealInfo::TPtr& ev) {
320323
if (const auto& setting = ev->Get()->GroupLayoutSanitizerEnabled) {
321-
bool previousSetting = std::exchange(GroupLayoutSanitizerEnabled, *setting);
322-
if (!previousSetting && GroupLayoutSanitizerEnabled) {
323-
UpdateLayoutInformationForAllGroups();
324-
}
324+
std::exchange(GroupLayoutSanitizerEnabled, *setting);
325325
}
326+
326327
if (const auto& setting = ev->Get()->AllowMultipleRealmsOccupation) {
327328
bool previousSetting = std::exchange(AllowMultipleRealmsOccupation, *setting);
328329
if (previousSetting != AllowMultipleRealmsOccupation) {
329330
UpdateLayoutInformationForAllGroups();
330331
}
331332
}
333+
332334
if (const auto& setting = ev->Get()->DonorMode) {
333335
DonorMode = *setting;
334336
}
@@ -345,9 +347,7 @@ namespace NKikimr::NBsController {
345347

346348
g.Content = std::move(*data);
347349

348-
if (GroupLayoutSanitizerEnabled) {
349-
UpdateGroupLayoutInformation(g);
350-
}
350+
UpdateGroupLayoutInformation(g);
351351

352352
ui32 numFailRealms = 0;
353353
ui32 numFailDomainsPerFailRealm = 0;
@@ -500,6 +500,7 @@ namespace NKikimr::NBsController {
500500
}
501501
}
502502

503+
GroupsWithInvalidLayoutCounter->store(GroupsWithInvalidLayout.Size());
503504
UnreassignableGroups->store(counter);
504505
}
505506

@@ -899,7 +900,7 @@ namespace NKikimr::NBsController {
899900
IActor *TBlobStorageController::CreateSelfHealActor() {
900901
Y_ABORT_UNLESS(HostRecords);
901902
return new TSelfHealActor(TabletID(), SelfHealUnreassignableGroups, HostRecords, GroupLayoutSanitizerEnabled,
902-
AllowMultipleRealmsOccupation, DonorMode, EnableSelfHealWithDegraded);
903+
AllowMultipleRealmsOccupation, DonorMode, EnableSelfHealWithDegraded, GroupLayoutSanitizerInvalidGroups);
903904
}
904905

905906
void TBlobStorageController::InitializeSelfHealState() {
@@ -1159,6 +1160,7 @@ namespace NKikimr::NBsController {
11591160
);
11601161

11611162
TabletCounters->Simple()[NBlobStorageController::COUNTER_SELF_HEAL_UNREASSIGNABLE_GROUPS] = SelfHealUnreassignableGroups->load();
1163+
TabletCounters->Simple()[NBlobStorageController::COUNTER_GROUP_LAYOUT_SANITIZER_INVALID_GROUPS] = GroupLayoutSanitizerInvalidGroups->load();
11621164

11631165
Schedule(TDuration::Seconds(15), new TEvPrivate::TEvUpdateSelfHealCounters);
11641166
}

ydb/core/protos/counters_bs_controller.proto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ enum ESimpleCounters {
2828
COUNTER_DISK_SCRUB_CUR_DISKS = 18 [(CounterOpts) = {Name: "CurrentlyScrubbedDisks"}];
2929
COUNTER_DISK_SCRUB_CUR_GROUPS = 19 [(CounterOpts) = {Name: "CurrentlyScrubbedGroups"}];
3030
COUNTER_SELF_HEAL_UNREASSIGNABLE_GROUPS = 20 [(CounterOpts) = {Name: "SelfHealUnreassignableGroups"}];
31+
COUNTER_GROUP_LAYOUT_SANITIZER_INVALID_GROUPS = 21 [(CounterOpts) = {Name: "GroupLayoutSanitizerInvlaidGroups"}];
3132
}
3233

3334
enum ECumulativeCounters {

0 commit comments

Comments
 (0)