Skip to content

Commit 8c1f7ef

Browse files
authored
Storage Balancer KIKIMR-20636 (#770)
1 parent eee3fb5 commit 8c1f7ef

16 files changed

+412
-2
lines changed

ydb/core/mind/hive/balancer.h

+4
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#pragma once
22

33
#include "hive_impl.h"
4+
#include "leader_tablet_info.h"
45

56
namespace NKikimr {
67
namespace NHive {
@@ -11,5 +12,8 @@ void BalanceNodes(std::vector<TNodeInfo*>& nodes, EResourceToBalance resourceTob
1112
template<NKikimrConfig::THiveConfig::EHiveTabletBalanceStrategy EHiveTabletBalanceStrategy>
1213
void BalanceTablets(std::vector<TTabletInfo*>& tablets, EResourceToBalance resourceToBalance);
1314

15+
template <NKikimrConfig::THiveConfig::EHiveChannelBalanceStrategy>
16+
void BalanceChannels(std::vector<TLeaderTabletInfo::TChannel>& channels, NKikimrConfig::THiveConfig::EHiveStorageBalanceStrategy metricToBalance);
17+
1418
}
1519
}

ydb/core/mind/hive/hive.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ TString EBalancerTypeName(EBalancerType value) {
4141
case EBalancerType::Emergency: return "Emergency";
4242
case EBalancerType::SpreadNeighbours: return "Spread";
4343
case EBalancerType::Manual: return "Manual";
44+
case EBalancerType::Storage: return "Storage";
4445
}
4546
}
4647

ydb/core/mind/hive/hive.h

+8-1
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,9 @@ enum class EBalancerType {
8585
ScatterNetwork,
8686
Emergency,
8787
SpreadNeighbours,
88+
Storage,
8889

89-
Last = SpreadNeighbours,
90+
Last = Storage,
9091
};
9192

9293
constexpr std::size_t EBalancerTypeSize = static_cast<std::size_t>(EBalancerType::Last) + 1;
@@ -261,6 +262,12 @@ struct TBalancerSettings {
261262
std::optional<TFullObjectId> FilterObjectId;
262263
};
263264

265+
struct TStorageBalancerSettings {
266+
ui64 NumReassigns;
267+
ui64 MaxInFlight;
268+
TString StoragePool;
269+
};
270+
264271
struct TBalancerStats {
265272
ui64 TotalRuns = 0;
266273
ui64 TotalMovements = 0;

ydb/core/mind/hive/hive_events.h

+14
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ struct TEvPrivate {
2727
EvProcessIncomingEvent,
2828
EvRefreshStorageInfo,
2929
EvLogTabletMoves,
30+
EvStartStorageBalancer,
31+
EvRestartCancelled,
3032
EvEnd
3133
};
3234

@@ -90,6 +92,18 @@ struct TEvPrivate {
9092
struct TEvRefreshStorageInfo : TEventLocal<TEvRefreshStorageInfo, EvRefreshStorageInfo> {};
9193

9294
struct TEvLogTabletMoves : TEventLocal<TEvLogTabletMoves, EvLogTabletMoves> {};
95+
96+
struct TEvStartStorageBalancer : TEventLocal<TEvStartStorageBalancer, EvStartStorageBalancer> {
97+
TStorageBalancerSettings Settings;
98+
99+
TEvStartStorageBalancer(TStorageBalancerSettings settings) : Settings(settings) {}
100+
};
101+
102+
struct TEvRestartCancelled : TEventLocal<TEvRestartCancelled, EvRestartCancelled> {
103+
TFullTabletId TabletId;
104+
105+
TEvRestartCancelled(TFullTabletId tabletId) : TabletId(tabletId) {}
106+
};
93107
};
94108

95109
} // NHive

ydb/core/mind/hive/hive_impl.cpp

+9
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,12 @@ void THive::Handle(TEvPrivate::TEvBalancerOut::TPtr&) {
350350
BLOG_D("Handle BalancerOut");
351351
}
352352

353+
354+
void THive::Handle(TEvPrivate::TEvStartStorageBalancer::TPtr& ev) {
355+
BLOG_D("Handle StartStorageBalancer");
356+
StartHiveStorageBalancer(std::move(ev->Get()->Settings));
357+
}
358+
353359
void THive::Handle(TEvHive::TEvBootTablet::TPtr& ev) {
354360
TTabletId tabletId = ev->Get()->Record.GetTabletID();
355361
TTabletInfo* tablet = FindTablet(tabletId);
@@ -2650,6 +2656,7 @@ TDuration THive::GetBalancerCooldown() const {
26502656
case EBalancerType::ScatterMemory:
26512657
case EBalancerType::ScatterNetwork:
26522658
case EBalancerType::SpreadNeighbours:
2659+
case EBalancerType::Storage:
26532660
return GetMinPeriodBetweenBalance();
26542661
case EBalancerType::Emergency:
26552662
return GetMinPeriodBetweenEmergencyBalance();
@@ -2860,6 +2867,7 @@ void THive::ProcessEvent(std::unique_ptr<IEventHandle> event) {
28602867
hFunc(TEvHive::TEvUpdateTabletsObject, Handle);
28612868
hFunc(TEvPrivate::TEvRefreshStorageInfo, Handle);
28622869
hFunc(TEvPrivate::TEvLogTabletMoves, Handle);
2870+
hFunc(TEvPrivate::TEvStartStorageBalancer, Handle);
28632871
hFunc(TEvHive::TEvUpdateDomain, Handle);
28642872
}
28652873
}
@@ -2958,6 +2966,7 @@ STFUNC(THive::StateWork) {
29582966
fFunc(TEvHive::TEvUpdateTabletsObject::EventType, EnqueueIncomingEvent);
29592967
fFunc(TEvPrivate::TEvRefreshStorageInfo::EventType, EnqueueIncomingEvent);
29602968
fFunc(TEvPrivate::TEvLogTabletMoves::EventType, EnqueueIncomingEvent);
2969+
fFunc(TEvPrivate::TEvStartStorageBalancer::EventType, EnqueueIncomingEvent);
29612970
fFunc(TEvHive::TEvUpdateDomain::EventType, EnqueueIncomingEvent);
29622971
hFunc(TEvPrivate::TEvProcessIncomingEvent, Handle);
29632972
default:

ydb/core/mind/hive/hive_impl.h

+12
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveShar
169169
friend class TQueryMigrationWaitActor;
170170
friend class TReleaseTabletsWaitActor;
171171
friend class TDrainNodeWaitActor;
172+
friend class THiveStorageBalancer;;
172173
friend struct TNodeInfo;
173174

174175
friend class TTxInitScheme;
@@ -204,6 +205,7 @@ class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveShar
204205
friend class TTxMonEvent_QueryMigration;
205206
friend class TTxMonEvent_RebalanceFromScratch;
206207
friend class TTxMonEvent_ObjectStats;
208+
friend class TTxMonEvent_StorageRebalance;
207209
friend class TTxKillNode;
208210
friend class TTxLoadEverything;
209211
friend class TTxRestartTablet;
@@ -239,6 +241,7 @@ class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveShar
239241
void StartHiveBalancer(TBalancerSettings&& settings);
240242
void StartHiveDrain(TNodeId nodeId, TDrainSettings settings);
241243
void StartHiveFill(TNodeId nodeId, const TActorId& initiator);
244+
void StartHiveStorageBalancer(TStorageBalancerSettings settings);
242245
void CreateEvMonitoring(NMon::TEvRemoteHttpInfo::TPtr& ev, const TActorContext& ctx);
243246
NJson::TJsonValue GetBalancerProgressJson();
244247
ITransaction* CreateDeleteTablet(TEvHive::TEvDeleteTablet::TPtr& ev);
@@ -551,6 +554,7 @@ class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveShar
551554
void Handle(TEvHive::TEvUpdateTabletsObject::TPtr& ev);
552555
void Handle(TEvPrivate::TEvRefreshStorageInfo::TPtr& ev);
553556
void Handle(TEvPrivate::TEvLogTabletMoves::TPtr& ev);
557+
void Handle(TEvPrivate::TEvStartStorageBalancer::TPtr& ev);
554558
void Handle(TEvPrivate::TEvProcessIncomingEvent::TPtr& ev);
555559
void Handle(TEvHive::TEvUpdateDomain::TPtr& ev);
556560

@@ -901,6 +905,14 @@ class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveShar
901905
return CurrentConfig.GetBootStrategy();
902906
}
903907

908+
NKikimrConfig::THiveConfig::EHiveChannelBalanceStrategy GetChannelBalanceStrategy() const {
909+
return CurrentConfig.GetChannelBalanceStrategy();
910+
}
911+
912+
ui64 GetMaxChannelHistorySize() const {
913+
return CurrentConfig.GetMaxChannelHistorySize();
914+
}
915+
904916
static void ActualizeRestartStatistics(google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier);
905917
static bool IsSystemTablet(TTabletTypes::EType type);
906918

ydb/core/mind/hive/hive_ut.cpp

+90
Original file line numberDiff line numberDiff line change
@@ -2780,6 +2780,96 @@ Y_UNIT_TEST_SUITE(THiveTest) {
27802780
UNIT_ASSERT_VALUES_EQUAL(getGroup(tabletId), goodGroup);
27812781
}
27822782

2783+
Y_UNIT_TEST(TestStorageBalancer) {
2784+
static constexpr ui64 NUM_TABLETS = 4;
2785+
TTestBasicRuntime runtime(1, false);
2786+
Setup(runtime, true, 2, [](TAppPrepare& app) {
2787+
app.HiveConfig.SetMinPeriodBetweenReassign(0);
2788+
});
2789+
const ui64 hiveTablet = MakeDefaultHiveID(0);
2790+
const ui64 testerTablet = MakeDefaultHiveID(1);
2791+
CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive);
2792+
2793+
TTabletTypes::EType tabletType = TTabletTypes::Dummy;
2794+
TVector<ui64> tablets;
2795+
for (ui64 i = 0; i < NUM_TABLETS; ++i) {
2796+
THolder<TEvHive::TEvCreateTablet> ev(new TEvHive::TEvCreateTablet(testerTablet, 100500 + i, tabletType, BINDED_CHANNELS));
2797+
ev->Record.SetObjectId(i);
2798+
ui64 tabletId = SendCreateTestTablet(runtime, hiveTablet, testerTablet, std::move(ev), 0, true);
2799+
tablets.emplace_back(tabletId);
2800+
MakeSureTabletIsUp(runtime, tabletId, 0);
2801+
}
2802+
ui64 tabletBase = tablets.front();
2803+
2804+
TActorId sender = runtime.AllocateEdgeActor();
2805+
auto getGroup = [&runtime, sender, hiveTablet](ui64 tabletId) {
2806+
runtime.SendToPipe(hiveTablet, sender, new TEvHive::TEvRequestHiveInfo({
2807+
.TabletId = tabletId,
2808+
.ReturnChannelHistory = true,
2809+
}));
2810+
TAutoPtr<IEventHandle> handle;
2811+
TEvHive::TEvResponseHiveInfo* response = runtime.GrabEdgeEventRethrow<TEvHive::TEvResponseHiveInfo>(handle);
2812+
2813+
const auto& tablet = response->Record.GetTablets().Get(0);
2814+
const auto& channel = tablet.GetTabletChannels().Get(0);
2815+
const auto& history = channel.GetHistory();
2816+
return history.Get(history.size() - 1).GetGroup();
2817+
};
2818+
2819+
std::unordered_map<ui64, std::vector<ui64>> groupToTablets;
2820+
for (auto tablet : tablets) {
2821+
groupToTablets[getGroup(tablet)].push_back(tablet);
2822+
}
2823+
ui64 tabletA;
2824+
ui64 tabletB;
2825+
for (const auto& [group, tablets] : groupToTablets) {
2826+
if (tablets.size() >= 2) {
2827+
tabletA = tablets[0];
2828+
tabletB = tablets[1];
2829+
}
2830+
}
2831+
2832+
// If assured space is not set, usage is always set to 1
2833+
auto groupMetricsExchange = MakeHolder<TEvBlobStorage::TEvControllerGroupMetricsExchange>();
2834+
for (const auto& [group, tablets] : groupToTablets) {
2835+
NKikimrBlobStorage::TGroupMetrics* metrics = groupMetricsExchange->Record.AddGroupMetrics();
2836+
2837+
metrics->SetGroupId(group);
2838+
metrics->MutableGroupParameters()->SetGroupID(group);
2839+
metrics->MutableGroupParameters()->SetStoragePoolName("def1");
2840+
metrics->MutableGroupParameters()->MutableAssuredResources()->SetSpace(300'000'000);
2841+
}
2842+
2843+
runtime.SendToPipe(MakeBSControllerID(0), sender, groupMetricsExchange.Release(), 0, GetPipeConfigWithRetries());
2844+
{
2845+
TDispatchOptions options;
2846+
options.FinalEvents.push_back(TDispatchOptions::TFinalEventCondition(TEvBlobStorage::EvControllerGroupMetricsExchange));
2847+
runtime.DispatchEvents(options);
2848+
}
2849+
2850+
TChannelsBindings channels = BINDED_CHANNELS;
2851+
for (auto& bind : channels) {
2852+
bind.SetSize(200'000'000);
2853+
}
2854+
for (auto tablet : {tabletA, tabletB}) {
2855+
TAutoPtr<TEvHive::TEvCreateTablet> updateTablet(new TEvHive::TEvCreateTablet(testerTablet, 100500 + (tablet - tabletBase), tabletType, channels));
2856+
SendCreateTestTablet(runtime, hiveTablet, testerTablet, updateTablet, 0, true);
2857+
}
2858+
runtime.SendToPipe(hiveTablet, sender, new NHive::TEvPrivate::TEvStartStorageBalancer({
2859+
.NumReassigns = 100,
2860+
.MaxInFlight = 1,
2861+
.StoragePool = "def1",
2862+
}));
2863+
2864+
{
2865+
TDispatchOptions options;
2866+
options.FinalEvents.emplace_back(NHive::TEvPrivate::EvRestartComplete, 4); // should actually be less than 4
2867+
runtime.DispatchEvents(options, TDuration::Seconds(10));
2868+
}
2869+
2870+
UNIT_ASSERT_VALUES_UNEQUAL(getGroup(tabletA), getGroup(tabletB));
2871+
}
2872+
27832873
// Y_UNIT_TEST(TestCreateTabletAndChangeProfiles) {
27842874
// TTestBasicRuntime runtime(1, false);
27852875
// Setup(runtime, true);

ydb/core/mind/hive/leader_tablet_info.cpp

+6
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,12 @@ const NKikimrBlobStorage::TEvControllerSelectGroupsResult::TGroupParameters* TLe
261261
});
262262
break;
263263
}
264+
case NKikimrHive::TEvReassignTablet::HIVE_REASSIGN_REASON_BALANCE: {
265+
return storagePool->FindFreeAllocationUnit([&params](const TStorageGroupInfo& newGroup) -> bool {
266+
return newGroup.IsMatchesParameters(*params);
267+
});
268+
break;
269+
}
264270
case NKikimrHive::TEvReassignTablet::HIVE_REASSIGN_REASON_SPACE: {
265271
NKikimrConfig::THiveConfig::EHiveStorageBalanceStrategy balanceStrategy = Hive.CurrentConfig.GetStorageBalanceStrategy();
266272
Hive.CurrentConfig.SetStorageBalanceStrategy(NKikimrConfig::THiveConfig::HIVE_STORAGE_BALANCE_STRATEGY_SIZE);

ydb/core/mind/hive/leader_tablet_info.h

+27
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,25 @@ struct TLeaderTabletInfo : TTabletInfo {
2626
static TString DEFAULT_STORAGE_POOL_NAME;
2727

2828
public:
29+
struct TChannel {
30+
TTabletId TabletId;
31+
ui32 ChannelId;
32+
const TChannelBind* ChannelInfo;
33+
34+
double GetWeight(NKikimrConfig::THiveConfig::EHiveStorageBalanceStrategy metricToBalance) const {
35+
Y_DEBUG_ABORT_UNLESS(ChannelInfo);
36+
switch (metricToBalance) {
37+
case NKikimrConfig::THiveConfig::HIVE_STORAGE_BALANCE_STRATEGY_IOPS:
38+
return ChannelInfo->GetIOPS();
39+
case NKikimrConfig::THiveConfig::HIVE_STORAGE_BALANCE_STRATEGY_THROUGHPUT:
40+
return ChannelInfo->GetThroughput();
41+
default:
42+
case NKikimrConfig::THiveConfig::HIVE_STORAGE_BALANCE_STRATEGY_SIZE:
43+
return ChannelInfo->GetSize();
44+
}
45+
}
46+
};
47+
2948
TTabletId Id;
3049
ETabletState State;
3150
TTabletTypes::EType Type;
@@ -298,6 +317,14 @@ struct TLeaderTabletInfo : TTabletInfo {
298317
return BoundChannels.size();
299318
}
300319

320+
TChannel GetChannel(ui32 channelId) const {
321+
TChannel channel{.TabletId = Id, .ChannelId = channelId, .ChannelInfo = nullptr};
322+
if (channelId < BoundChannels.size()) {
323+
channel.ChannelInfo = &BoundChannels[channelId];
324+
}
325+
return channel;
326+
}
327+
301328
void AcquireAllocationUnits();
302329
void ReleaseAllocationUnits();
303330
bool AcquireAllocationUnit(ui32 channelId);

ydb/core/mind/hive/monitoring.cpp

+31-1
Original file line numberDiff line numberDiff line change
@@ -1424,7 +1424,8 @@ class TTxMonEvent_Landing : public TTransactionBase<THive> {
14241424
EBalancerType::Emergency,
14251425
EBalancerType::SpreadNeighbours,
14261426
EBalancerType::Scatter,
1427-
EBalancerType::Manual
1427+
EBalancerType::Manual,
1428+
EBalancerType::Storage,
14281429
}) {
14291430
int balancer = static_cast<int>(type);
14301431
out << "<tr id='balancer" << balancer << "'><td>" << EBalancerTypeName(type) << "</td><td></td><td></td><td></td><td></td><td></td></tr>";
@@ -2505,6 +2506,32 @@ class TTxMonEvent_Rebalance : public TTransactionBase<THive> {
25052506
}
25062507
};
25072508

2509+
class TTxMonEvent_StorageRebalance : public TTransactionBase<THive> {
2510+
public:
2511+
const TActorId Source;
2512+
TStorageBalancerSettings Settings;
2513+
2514+
TTxMonEvent_StorageRebalance(const TActorId& source, NMon::TEvRemoteHttpInfo::TPtr& ev, TSelf* hive)
2515+
: TBase(hive)
2516+
, Source(source)
2517+
{
2518+
Settings.NumReassigns = FromStringWithDefault(ev->Get()->Cgi().Get("reassigns"), 1000);
2519+
Settings.MaxInFlight = FromStringWithDefault(ev->Get()->Cgi().Get("inflight"), 1);
2520+
Settings.StoragePool = ev->Get()->Cgi().Get("pool");
2521+
}
2522+
2523+
TTxType GetTxType() const override { return NHive::TXTYPE_MON_REBALANCE; }
2524+
2525+
bool Execute(TTransactionContext&, const TActorContext&) override {
2526+
Self->StartHiveStorageBalancer(Settings);
2527+
return true;
2528+
}
2529+
2530+
void Complete(const TActorContext& ctx) override {
2531+
ctx.Send(Source, new NMon::TEvRemoteJsonInfoRes("{}"));
2532+
}
2533+
};
2534+
25082535
class TTxMonEvent_RebalanceFromScratch : public TTransactionBase<THive> {
25092536
public:
25102537
const TActorId Source;
@@ -4072,6 +4099,9 @@ void THive::CreateEvMonitoring(NMon::TEvRemoteHttpInfo::TPtr& ev, const TActorCo
40724099
if (page == "Storage") {
40734100
return Execute(new TTxMonEvent_Storage(ev->Sender, ev, this), ctx);
40744101
}
4102+
if (page == "StorageRebalance") {
4103+
return Execute(new TTxMonEvent_StorageRebalance(ev->Sender, ev, this), ctx);
4104+
}
40754105
return Execute(new TTxMonEvent_Landing(ev->Sender, ev, this), ctx);
40764106
}
40774107

0 commit comments

Comments
 (0)