Skip to content

Commit 42e57bf

Browse files
authored
Merge da469a0 into 14c79b8
2 parents 14c79b8 + da469a0 commit 42e57bf

File tree

5 files changed

+79
-1
lines changed

5 files changed

+79
-1
lines changed

ydb/core/mind/hive/hive_impl.cpp

+2 -1
Original file line numberDiff line numberDiff line change
@@ -2327,7 +2327,8 @@ void THive::Handle(TEvPrivate::TEvProcessTabletBalancer::TPtr&) {
23272327
nodeUsageHistogram.IncrementFor(record.Usage * 100);
23282328
}
23292329

2330-
if (stats.MaxUsage >= GetMaxNodeUsageToKick()) {
2330+
double minUsageToKick = GetMaxNodeUsageToKick() - GetNodeUsageRangeToKick();
2331+
if (stats.MaxUsage >= GetMaxNodeUsageToKick() && stats.MinUsage < minUsageToKick) {
23312332
std::vector<TNodeId> overloadedNodes;
23322333
for (const auto& [nodeId, nodeInfo] : Nodes) {
23332334
if (nodeInfo.IsAlive() && !nodeInfo.Down && nodeInfo.IsOverloaded()) {

ydb/core/mind/hive/hive_impl.h

+4
Original file line numberDiff line numberDiff line change
@@ -934,6 +934,10 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId
934934
return CurrentConfig.GetStorageBalancerInflight();
935935
}
936936

937+
// Returns the NodeUsageRangeToKick value from CurrentConfig.
// The tablet balancer handler in hive_impl.cpp subtracts this value from
// GetMaxNodeUsageToKick() to obtain the threshold that stats.MinUsage must be
// below before overloaded nodes are collected.
double GetNodeUsageRangeToKick() const {
938+
return CurrentConfig.GetNodeUsageRangeToKick();
939+
}
940+
937941
static void ActualizeRestartStatistics(google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier);
938942
static ui64 GetRestartsPerPeriod(const google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier);
939943
static bool IsSystemTablet(TTabletTypes::EType type);

ydb/core/mind/hive/hive_ut.cpp

+70
Original file line numberDiff line numberDiff line change
@@ -3961,6 +3961,7 @@ Y_UNIT_TEST_SUITE(THiveTest) {
39613961
// this value of MaxNodeUsageToKick is selected specifically to make test scenario work
39623962
// in conjunction with the number of tablets and the network usage metric values used below
39633963
app.HiveConfig.SetMaxNodeUsageToKick(0.01);
3964+
app.HiveConfig.SetNodeUsageRangeToKick(0);
39643965
app.HiveConfig.SetEmergencyBalancerInflight(1); // to ensure fair distribution
39653966
});
39663967

@@ -4855,6 +4856,75 @@ Y_UNIT_TEST_SUITE(THiveTest) {
48554856
UNIT_ASSERT_VALUES_EQUAL(newDistribution[1].size(), TABLETS_PER_NODE - 1);
48564857
}
48574858

4859+
// Scenario: two nodes report total usages of 0.89 and 0.91; after an
// EvBalancerOut event has been dispatched, the test asserts that the
// per-node tablet distribution is unchanged.
// The balancer handler in hive_impl.cpp collects overloaded nodes only when
// stats.MaxUsage >= GetMaxNodeUsageToKick() and
// stats.MinUsage < GetMaxNodeUsageToKick() - GetNodeUsageRangeToKick().
// NOTE(review): this test does not set either threshold, so it relies on the
// config defaults (NodeUsageRangeToKick defaults to 0.2 in config.proto; the
// MaxNodeUsageToKick default is not visible here — confirm).
Y_UNIT_TEST(TestHiveBalancerHighUsage) {
4860+
static constexpr ui64 NUM_NODES = 2;
4861+
// NOTE(review): the literal 2 below duplicates NUM_NODES.
TTestBasicRuntime runtime(2, false);
4862+
// Both periods are set to 0 (presumably so that the balancer reacts to the
// reported metrics without waiting — confirm).
Setup(runtime, true, 1, [](TAppPrepare& app) {
4863+
app.HiveConfig.SetTabletKickCooldownPeriod(0);
4864+
app.HiveConfig.SetResourceChangeReactionPeriod(0);
4865+
});
4866+
// Node ID of runtime node 0; used to turn a tablet's node ID into a 0-based index.
const int nodeBase = runtime.GetNodeId(0);
4867+
TActorId senderA = runtime.AllocateEdgeActor();
4868+
const ui64 hiveTablet = MakeDefaultHiveID();
4869+
const ui64 testerTablet = MakeTabletID(false, 1);
4870+
4871+
// Requests TEvRequestHiveInfo from Hive and returns, for each node index
// (tablet node ID minus nodeBase), the list of tablet IDs reported on it.
// Asserts that every reported node index lies in [0, NUM_NODES).
auto getDistribution = [hiveTablet, nodeBase, senderA, &runtime]() -> std::array<std::vector<ui64>, NUM_NODES> {
4872+
std::array<std::vector<ui64>, NUM_NODES> nodeTablets = {};
4873+
{
4874+
runtime.SendToPipe(hiveTablet, senderA, new TEvHive::TEvRequestHiveInfo());
4875+
TAutoPtr<IEventHandle> handle;
4876+
TEvHive::TEvResponseHiveInfo* response = runtime.GrabEdgeEventRethrow<TEvHive::TEvResponseHiveInfo>(handle);
4877+
for (const NKikimrHive::TTabletInfo& tablet : response->Record.GetTablets()) {
4878+
UNIT_ASSERT_C(((int)tablet.GetNodeID() - nodeBase >= 0) && (tablet.GetNodeID() - nodeBase < NUM_NODES),
4879+
"nodeId# " << tablet.GetNodeID() << " nodeBase# " << nodeBase);
4880+
nodeTablets[tablet.GetNodeID() - nodeBase].push_back(tablet.GetTabletID());
4881+
}
4882+
}
4883+
return nodeTablets;
4884+
};
4885+
4886+
CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive);
4887+
4888+
// wait for creation of nodes
4889+
{
4890+
TDispatchOptions options;
4891+
// Dispatch until TEvLocal::EvStatus has been seen NUM_NODES times.
options.FinalEvents.emplace_back(TEvLocal::EvStatus, NUM_NODES);
4892+
runtime.DispatchEvents(options);
4893+
}
4894+
4895+
TTabletTypes::EType tabletType = TTabletTypes::Dummy;
4896+
// Create two Dummy tablets, each with its own object ID, and wait until each is up.
// NOTE(review): the loop bound is the literal 2 rather than NUM_NODES.
for (size_t i = 0; i < 2; ++i) {
4897+
THolder<TEvHive::TEvCreateTablet> ev(new TEvHive::TEvCreateTablet(testerTablet, 100500 + i, tabletType, BINDED_CHANNELS));
4898+
ev->Record.SetObjectId(i);
4899+
ui64 tabletId = SendCreateTestTablet(runtime, hiveTablet, testerTablet, std::move(ev), 0, true);
4900+
MakeSureTabletIsUp(runtime, tabletId, 0);
4901+
}
4902+
4903+
// Snapshot of the tablet placement before any usage metrics are reported.
auto initialDistribution = getDistribution();
4904+
4905+
// Total node usage reported for node 0 and node 1 respectively.
std::array<double, NUM_NODES> usages = {.89, .91};
4906+
// Each node's usage is sent to Hive in a TEvTabletMetrics message; the outer
// loop sends the full round twice.
// NOTE(review): the reason for repeating the round is not evident here.
for (ui32 i = 0; i < 2; ++i) {
4907+
for (ui32 node = 0; node < NUM_NODES; ++node) {
4908+
TActorId sender = runtime.AllocateEdgeActor(node);
4909+
THolder<TEvHive::TEvTabletMetrics> metrics = MakeHolder<TEvHive::TEvTabletMetrics>();
4910+
metrics->Record.SetTotalNodeUsage(usages[node]);
4911+
4912+
runtime.SendToPipe(hiveTablet, sender, metrics.Release(), node);
4913+
}
4914+
}
4915+
4916+
{
4917+
TDispatchOptions options;
4918+
// Dispatch until NHive::TEvPrivate::EvBalancerOut is seen; the second
// DispatchEvents argument is presumably a time limit — confirm.
options.FinalEvents.emplace_back(NHive::TEvPrivate::EvBalancerOut);
4919+
runtime.DispatchEvents(options, TDuration::Seconds(10));
4920+
}
4921+
4922+
// Check that balancer moved no tablets
4923+
auto newDistribution = getDistribution();
4924+
4925+
UNIT_ASSERT_EQUAL(initialDistribution, newDistribution);
4926+
}
4927+
48584928
Y_UNIT_TEST(TestUpdateTabletsObjectUpdatesMetrics) {
48594929
TTestBasicRuntime runtime(1, false);
48604930
Setup(runtime, true);

ydb/core/mind/hive/monitoring.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -794,6 +794,7 @@ class TTxMonEvent_Settings : public TTransactionBase<THive>, public TLoggedMonTr
794794
UpdateConfig(db, "MinNetworkScatterToBalance", configUpdates);
795795
UpdateConfig(db, "MinCounterScatterToBalance", configUpdates);
796796
UpdateConfig(db, "MaxNodeUsageToKick", configUpdates, TSchemeIds::State::MaxNodeUsageToKick);
797+
UpdateConfig(db, "NodeUsageRangeToKick", configUpdates);
797798
UpdateConfig(db, "ResourceChangeReactionPeriod", configUpdates, TSchemeIds::State::ResourceChangeReactionPeriod);
798799
UpdateConfig(db, "TabletKickCooldownPeriod", configUpdates, TSchemeIds::State::TabletKickCooldownPeriod);
799800
UpdateConfig(db, "SpreadNeighbours", configUpdates, TSchemeIds::State::SpreadNeighbours);
@@ -1140,6 +1141,7 @@ class TTxMonEvent_Settings : public TTransactionBase<THive>, public TLoggedMonTr
11401141
ShowConfig(out, "MinCounterScatterToBalance");
11411142
ShowConfig(out, "MinNodeUsageToBalance");
11421143
ShowConfig(out, "MaxNodeUsageToKick");
1144+
ShowConfig(out, "NodeUsageRangeToKick");
11431145
ShowConfig(out, "ResourceChangeReactionPeriod");
11441146
ShowConfig(out, "TabletKickCooldownPeriod");
11451147
ShowConfig(out, "NodeSelectStrategy");

ydb/core/protos/config.proto

+1
Original file line numberDiff line numberDiff line change
@@ -1469,6 +1469,7 @@ message THiveConfig {
14691469
optional double MinGroupUsageToBalance = 72 [default = 0.1];
14701470
optional uint64 StorageBalancerInflight = 73 [default = 1];
14711471
optional bool EnableDestroyOperations = 74 [default = false];
1472+
optional double NodeUsageRangeToKick = 75 [default = 0.2];
14721473
}
14731474

14741475
message TBlobCacheConfig {

0 commit comments

Comments
 (0)