Skip to content

Commit cc619e9

Browse files
authored
Merge 8734f79 into ee02567
2 parents ee02567 + 8734f79 commit cc619e9

File tree

5 files changed

+78
-1
lines changed

5 files changed

+78
-1
lines changed

ydb/core/mind/hive/hive_impl.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -2329,7 +2329,8 @@ void THive::Handle(TEvPrivate::TEvProcessTabletBalancer::TPtr&) {
23292329
nodeUsageHistogram.IncrementFor(record.Usage * 100);
23302330
}
23312331

2332-
if (stats.MaxUsage >= GetMaxNodeUsageToKick()) {
2332+
double minUsageToKick = GetMaxNodeUsageToKick() - GetNodeUsageDiffToEmergencyBalance();
2333+
if (stats.MaxUsage >= GetMaxNodeUsageToKick() && stats.MinUsage < minUsageToKick) {
23332334
std::vector<TNodeId> overloadedNodes;
23342335
for (const auto& [nodeId, nodeInfo] : Nodes) {
23352336
if (nodeInfo.IsAlive() && !nodeInfo.Down && nodeInfo.IsOverloaded()) {

ydb/core/mind/hive/hive_impl.h

+4
Original file line numberDiff line numberDiff line change
@@ -934,6 +934,10 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId
934934
return CurrentConfig.GetStorageBalancerInflight();
935935
}
936936

937+
// Returns the NodeUsageDiffToEmergencyBalance setting read from CurrentConfig.
// NOTE(review): the config.proto hunk of this same commit adds a field named
// MinNodeUsageToNotBalance, not NodeUsageDiffToEmergencyBalance - confirm that
// the generated accessor called below actually exists.
double GetNodeUsageDiffToEmergencyBalance() const {
938+
return CurrentConfig.GetNodeUsageDiffToEmergencyBalance();
939+
}
940+
937941
static void ActualizeRestartStatistics(google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier);
938942
static ui64 GetRestartsPerPeriod(const google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier);
939943
static bool IsSystemTablet(TTabletTypes::EType type);

ydb/core/mind/hive/hive_ut.cpp

+69
Original file line numberDiff line numberDiff line change
@@ -4968,6 +4968,75 @@ Y_UNIT_TEST_SUITE(THiveTest) {
49684968
UNIT_ASSERT_VALUES_EQUAL(newDistribution[1].size(), TABLETS_PER_NODE - 1);
49694969
}
49704970

4971+
// Boots a two-node Hive, creates two dummy tablets, reports total node usage
// of 0.89 and 0.91, waits for the balancer's EvBalancerOut event and asserts
// that the per-node tablet distribution reported by Hive did not change.
Y_UNIT_TEST(TestHiveBalancerHighUsage) {
    static constexpr ui64 NUM_NODES = 2;
    static constexpr size_t NUM_TABLETS = 2;
    static constexpr ui32 METRICS_ROUNDS = 2;
    using TDistribution = std::array<std::vector<ui64>, NUM_NODES>;

    TTestBasicRuntime runtime(NUM_NODES, false);
    // Zero cooldown / reaction periods are configured for this test's Hive.
    Setup(runtime, true, 1, [](TAppPrepare& app) {
        app.HiveConfig.SetTabletKickCooldownPeriod(0);
        app.HiveConfig.SetResourceChangeReactionPeriod(0);
    });

    const int nodeBase = runtime.GetNodeId(0);
    TActorId senderA = runtime.AllocateEdgeActor();
    const ui64 hiveTablet = MakeDefaultHiveID();
    const ui64 testerTablet = MakeTabletID(false, 1);

    // Asks Hive for its tablet list and groups tablet ids by zero-based node index.
    auto getDistribution = [hiveTablet, nodeBase, senderA, &runtime]() -> TDistribution {
        TDistribution nodeTablets = {};
        runtime.SendToPipe(hiveTablet, senderA, new TEvHive::TEvRequestHiveInfo());
        TAutoPtr<IEventHandle> handle;
        TEvHive::TEvResponseHiveInfo* response = runtime.GrabEdgeEventRethrow<TEvHive::TEvResponseHiveInfo>(handle);
        for (const NKikimrHive::TTabletInfo& tablet : response->Record.GetTablets()) {
            // Every reported node id must map into [0, NUM_NODES).
            UNIT_ASSERT_C(((int)tablet.GetNodeID() - nodeBase >= 0) && (tablet.GetNodeID() - nodeBase < NUM_NODES),
                    "nodeId# " << tablet.GetNodeID() << " nodeBase# " << nodeBase);
            nodeTablets[tablet.GetNodeID() - nodeBase].push_back(tablet.GetTabletID());
        }
        return nodeTablets;
    };

    CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive);

    // Block until every node has produced its TEvLocal::EvStatus.
    {
        TDispatchOptions waitForNodes;
        waitForNodes.FinalEvents.emplace_back(TEvLocal::EvStatus, NUM_NODES);
        runtime.DispatchEvents(waitForNodes);
    }

    // Create the tablets, each with its own object id, and wait until each is up.
    TTabletTypes::EType tabletType = TTabletTypes::Dummy;
    for (size_t idx = 0; idx < NUM_TABLETS; ++idx) {
        THolder<TEvHive::TEvCreateTablet> createRequest(
                new TEvHive::TEvCreateTablet(testerTablet, 100500 + idx, tabletType, BINDED_CHANNELS));
        createRequest->Record.SetObjectId(idx);
        ui64 tabletId = SendCreateTestTablet(runtime, hiveTablet, testerTablet, std::move(createRequest), 0, true);
        MakeSureTabletIsUp(runtime, tabletId, 0);
    }

    auto initialDistribution = getDistribution();

    // Report the per-node usage to Hive; every node is reported METRICS_ROUNDS times.
    // NOTE(review): 0.89 / 0.91 presumably straddle the MaxNodeUsageToKick setting - confirm its default.
    std::array<double, NUM_NODES> usages = {.89, .91};
    for (ui32 round = 0; round < METRICS_ROUNDS; ++round) {
        for (ui32 node = 0; node < NUM_NODES; ++node) {
            TActorId sender = runtime.AllocateEdgeActor(node);
            THolder<TEvHive::TEvTabletMetrics> metrics = MakeHolder<TEvHive::TEvTabletMetrics>();
            metrics->Record.SetTotalNodeUsage(usages[node]);
            runtime.SendToPipe(hiveTablet, sender, metrics.Release(), node);
        }
    }

    // Dispatch until the balancer reports EvBalancerOut (bounded by 10 seconds).
    {
        TDispatchOptions waitForBalancer;
        waitForBalancer.FinalEvents.emplace_back(NHive::TEvPrivate::EvBalancerOut);
        runtime.DispatchEvents(waitForBalancer, TDuration::Seconds(10));
    }

    // Check that balancer moved no tablets
    auto newDistribution = getDistribution();

    UNIT_ASSERT_EQUAL(initialDistribution, newDistribution);
}
5039+
49715040
Y_UNIT_TEST(TestUpdateTabletsObjectUpdatesMetrics) {
49725041
TTestBasicRuntime runtime(1, false);
49735042
Setup(runtime, true);

ydb/core/mind/hive/monitoring.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -794,6 +794,7 @@ class TTxMonEvent_Settings : public TTransactionBase<THive>, public TLoggedMonTr
794794
UpdateConfig(db, "MinNetworkScatterToBalance", configUpdates);
795795
UpdateConfig(db, "MinCounterScatterToBalance", configUpdates);
796796
UpdateConfig(db, "MaxNodeUsageToKick", configUpdates, TSchemeIds::State::MaxNodeUsageToKick);
797+
UpdateConfig(db, "NodeUsageDiffToEmergencyBalance", configUpdates);
797798
UpdateConfig(db, "ResourceChangeReactionPeriod", configUpdates, TSchemeIds::State::ResourceChangeReactionPeriod);
798799
UpdateConfig(db, "TabletKickCooldownPeriod", configUpdates, TSchemeIds::State::TabletKickCooldownPeriod);
799800
UpdateConfig(db, "SpreadNeighbours", configUpdates, TSchemeIds::State::SpreadNeighbours);
@@ -1140,6 +1141,7 @@ class TTxMonEvent_Settings : public TTransactionBase<THive>, public TLoggedMonTr
11401141
ShowConfig(out, "MinCounterScatterToBalance");
11411142
ShowConfig(out, "MinNodeUsageToBalance");
11421143
ShowConfig(out, "MaxNodeUsageToKick");
1144+
ShowConfig(out, "NodeUsageDiffToEmergencyBalance");
11431145
ShowConfig(out, "ResourceChangeReactionPeriod");
11441146
ShowConfig(out, "TabletKickCooldownPeriod");
11451147
ShowConfig(out, "NodeSelectStrategy");

ydb/core/protos/config.proto

+1
Original file line numberDiff line numberDiff line change
@@ -1469,6 +1469,7 @@ message THiveConfig {
14691469
optional double MinGroupUsageToBalance = 72 [default = 0.1];
14701470
optional uint64 StorageBalancerInflight = 73 [default = 1];
14711471
optional bool EnableDestroyOperations = 74 [default = false];
1472+
optional double MinNodeUsageToNotBalance = 75 [default = 0.7];
14721473
}
14731474

14741475
message TBlobCacheConfig {

0 commit comments

Comments
 (0)