Skip to content

Commit 8a1ba16

Browse files
authored
tablets with disabled metrics ignore metric-related balancers (#4052)
1 parent 7b8da5c commit 8a1ba16

File tree

7 files changed

+53
-24
lines changed

7 files changed

+53
-24
lines changed

ydb/core/mind/hive/balancer.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,9 @@ class THiveBalancer : public NActors::TActorBootstrapped<THiveBalancer>, public
243243
std::vector<TTabletInfo*> tablets;
244244
tablets.reserve(nodeTablets.size());
245245
for (TTabletInfo* tablet : nodeTablets) {
246-
if (tablet->IsGoodForBalancer(now) && (!Settings.FilterObjectId || tablet->GetObjectId() == *Settings.FilterObjectId)) {
246+
if (tablet->IsGoodForBalancer(now) &&
247+
(!Settings.FilterObjectId || tablet->GetObjectId() == *Settings.FilterObjectId) &&
248+
tablet->HasAllowedMetric(Settings.ResourceToBalance)) {
247249
tablet->UpdateWeight();
248250
tablets.emplace_back(tablet);
249251
}

ydb/core/mind/hive/hive.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ constexpr std::size_t EBalancerTypeSize = static_cast<std::size_t>(EBalancerType
9898
TString EBalancerTypeName(EBalancerType value);
9999

100100
enum class EResourceToBalance {
101-
Dominant,
101+
ComputeResources,
102102
Counter,
103103
CPU,
104104
Memory,
@@ -296,7 +296,7 @@ struct TBalancerSettings {
296296
bool RecheckOnFinish = false;
297297
ui64 MaxInFlight = 1;
298298
const std::vector<TNodeId> FilterNodeIds = {};
299-
EResourceToBalance ResourceToBalance = EResourceToBalance::Dominant;
299+
EResourceToBalance ResourceToBalance = EResourceToBalance::ComputeResources;
300300
std::optional<TFullObjectId> FilterObjectId;
301301
};
302302

ydb/core/mind/hive/hive_impl.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2373,7 +2373,7 @@ void THive::Handle(TEvPrivate::TEvProcessTabletBalancer::TPtr&) {
23732373
case EResourceToBalance::Network:
23742374
balancerType = EBalancerType::ScatterNetwork;
23752375
break;
2376-
case EResourceToBalance::Dominant:
2376+
case EResourceToBalance::ComputeResources:
23772377
balancerType = EBalancerType::Scatter;
23782378
break;
23792379
}

ydb/core/mind/hive/node_info.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,7 @@ i32 TNodeInfo::GetPriorityForTablet(const TTabletInfo& tablet) const {
222222

223223
bool TNodeInfo::IsAbleToRunTablet(const TTabletInfo& tablet, TTabletDebugState* debugState) const {
224224
if (tablet.IsAliveOnLocal(Local)) {
225-
return !IsOverloaded();
225+
return !(IsOverloaded() && tablet.HasAllowedMetric(EResourceToBalance::ComputeResources));
226226
}
227227
if (tablet.IsLeader()) {
228228
const TLeaderTabletInfo& leader = tablet.AsLeader();
@@ -280,7 +280,7 @@ bool TNodeInfo::IsAbleToRunTablet(const TTabletInfo& tablet, TTabletDebugState*
280280
}
281281
}
282282

283-
if (tablet.IsAlive() && IsOverloaded()) {
283+
if (tablet.IsAlive() && IsOverloaded() && tablet.HasAllowedMetric(EResourceToBalance::ComputeResources)) {
284284
// we don't move already running tablet to another overloaded node
285285
if (debugState) {
286286
debugState->NodesWithoutResources++;
@@ -434,7 +434,7 @@ double TNodeInfo::GetNodeUsageForTablet(const TTabletInfo& tablet) const {
434434

435435
double TNodeInfo::GetNodeUsage(const TResourceNormalizedValues& normValues, EResourceToBalance resource) const {
436436
double usage = TTabletInfo::ExtractResourceUsage(normValues, resource);
437-
if (resource == EResourceToBalance::Dominant && AveragedNodeTotalUsage.IsValueStable()) {
437+
if (resource == EResourceToBalance::ComputeResources && AveragedNodeTotalUsage.IsValueStable()) {
438438
usage = std::max(usage, AveragedNodeTotalUsage.GetValue());
439439
}
440440
return usage;

ydb/core/mind/hive/node_info.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -243,9 +243,9 @@ struct TNodeInfo {
243243
}
244244

245245
double GetNodeUsageForTablet(const TTabletInfo& tablet) const;
246-
double GetNodeUsage(EResourceToBalance resource = EResourceToBalance::Dominant) const;
246+
double GetNodeUsage(EResourceToBalance resource = EResourceToBalance::ComputeResources) const;
247247
double GetNodeUsage(const TResourceNormalizedValues& normValues,
248-
EResourceToBalance resource = EResourceToBalance::Dominant) const;
248+
EResourceToBalance resource = EResourceToBalance::ComputeResources) const;
249249

250250
ui64 GetTabletsRunningByType(TTabletTypes::EType tabletType) const;
251251

ydb/core/mind/hive/tablet_info.cpp

Lines changed: 37 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -310,12 +310,37 @@ const TVector<i64>& TTabletInfo::GetTabletAllowedMetricIds() const {
310310
return Hive.GetTabletTypeAllowedMetricIds(GetLeader().Type);
311311
}
312312

313+
bool TTabletInfo::HasAllowedMetric(const TVector<i64>& allowedMetricIds, EResourceToBalance resource) {
314+
switch (resource) {
315+
case EResourceToBalance::ComputeResources: {
316+
auto isComputeMetric = [](i64 metricId) {
317+
return metricId == NKikimrTabletBase::TMetrics::kCPUFieldNumber ||
318+
metricId == NKikimrTabletBase::TMetrics::kMemoryFieldNumber ||
319+
metricId == NKikimrTabletBase::TMetrics::kNetworkFieldNumber;
320+
};
321+
return AnyOf(allowedMetricIds.begin(), allowedMetricIds.end(), isComputeMetric);
322+
}
323+
case EResourceToBalance::Counter:
324+
return true;
325+
case EResourceToBalance::CPU:
326+
return Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kCPUFieldNumber) != allowedMetricIds.end();
327+
case EResourceToBalance::Memory:
328+
return Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kMemoryFieldNumber) != allowedMetricIds.end();
329+
case EResourceToBalance::Network:
330+
return Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kNetworkFieldNumber) != allowedMetricIds.end();
331+
}
332+
}
333+
334+
// Per-tablet overload: fetches this tablet's allowed metric ids via
// GetTabletAllowedMetricIds() and defers the decision to the static overload.
bool TTabletInfo::HasAllowedMetric(EResourceToBalance resource) const {
    const TVector<i64>& allowedMetricIds = GetTabletAllowedMetricIds();
    return HasAllowedMetric(allowedMetricIds, resource);
}
337+
313338
void TTabletInfo::UpdateResourceUsage(const NKikimrTabletBase::TMetrics& metrics) {
314339
TInstant now = TActivationContext::Now();
315340
const TVector<i64>& allowedMetricIds(GetTabletAllowedMetricIds());
316341
auto before = ResourceValues;
317342
auto maximum = GetResourceMaximumValues();
318-
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kCPUFieldNumber) != allowedMetricIds.end()) {
343+
if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::CPU)) {
319344
if (metrics.HasCPU()) {
320345
if (metrics.GetCPU() > static_cast<ui64>(std::get<NMetrics::EResource::CPU>(maximum))) {
321346
BLOG_W("Ignoring too high CPU metric (" << metrics.GetCPU() << ") for tablet " << ToString());
@@ -325,7 +350,7 @@ void TTabletInfo::UpdateResourceUsage(const NKikimrTabletBase::TMetrics& metrics
325350
}
326351
}
327352
}
328-
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kMemoryFieldNumber) != allowedMetricIds.end()) {
353+
if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::Memory)) {
329354
if (metrics.HasMemory()) {
330355
if (metrics.GetMemory() > static_cast<ui64>(std::get<NMetrics::EResource::Memory>(maximum))) {
331356
BLOG_W("Ignoring too high Memory metric (" << metrics.GetMemory() << ") for tablet " << ToString());
@@ -335,7 +360,7 @@ void TTabletInfo::UpdateResourceUsage(const NKikimrTabletBase::TMetrics& metrics
335360
}
336361
}
337362
}
338-
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kNetworkFieldNumber) != allowedMetricIds.end()) {
363+
if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::Network)) {
339364
if (metrics.HasNetwork()) {
340365
if (metrics.GetNetwork() > static_cast<ui64>(std::get<NMetrics::EResource::Network>(maximum))) {
341366
BLOG_W("Ignoring too high Network metric (" << metrics.GetNetwork() << ") for tablet " << ToString());
@@ -396,13 +421,13 @@ TResourceRawValues TTabletInfo::GetResourceMaximumValues() const {
396421
}
397422

398423
i64 TTabletInfo::GetCounterValue(const NKikimrTabletBase::TMetrics& metrics, const TVector<i64>& allowedMetricIds) {
399-
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kCPUFieldNumber) != allowedMetricIds.end() && THive::IsValidMetricsCPU(metrics)) {
424+
if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::CPU) && THive::IsValidMetricsCPU(metrics)) {
400425
return 0;
401426
}
402-
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kMemoryFieldNumber) != allowedMetricIds.end() && THive::IsValidMetricsMemory(metrics)) {
427+
if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::Memory) && THive::IsValidMetricsMemory(metrics)) {
403428
return 0;
404429
}
405-
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kNetworkFieldNumber) != allowedMetricIds.end() && THive::IsValidMetricsNetwork(metrics)) {
430+
if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::Network) && THive::IsValidMetricsNetwork(metrics)) {
406431
return 0;
407432
}
408433
return 1;
@@ -414,13 +439,13 @@ void TTabletInfo::FilterRawValues(TResourceRawValues& values) const {
414439
if (metrics.GetCounter() == 0) {
415440
std::get<NMetrics::EResource::Counter>(values) = 0;
416441
}
417-
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kCPUFieldNumber) == allowedMetricIds.end() || !THive::IsValidMetricsCPU(metrics)) {
442+
if (!HasAllowedMetric(allowedMetricIds, EResourceToBalance::CPU) || !THive::IsValidMetricsCPU(metrics)) {
418443
std::get<NMetrics::EResource::CPU>(values) = 0;
419444
}
420-
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kMemoryFieldNumber) == allowedMetricIds.end() || !THive::IsValidMetricsMemory(metrics)) {
445+
if (!HasAllowedMetric(allowedMetricIds, EResourceToBalance::Memory) || !THive::IsValidMetricsMemory(metrics)) {
421446
std::get<NMetrics::EResource::Memory>(values) = 0;
422447
}
423-
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kNetworkFieldNumber) == allowedMetricIds.end() || !THive::IsValidMetricsNetwork(metrics)) {
448+
if (!HasAllowedMetric(allowedMetricIds, EResourceToBalance::Network) || !THive::IsValidMetricsNetwork(metrics)) {
424449
std::get<NMetrics::EResource::Network>(values) = 0;
425450
}
426451
}
@@ -431,13 +456,13 @@ void TTabletInfo::FilterRawValues(TResourceNormalizedValues& values) const {
431456
if (metrics.GetCounter() == 0) {
432457
std::get<NMetrics::EResource::Counter>(values) = 0;
433458
}
434-
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kCPUFieldNumber) == allowedMetricIds.end() || !THive::IsValidMetricsCPU(metrics)) {
459+
if (!HasAllowedMetric(allowedMetricIds, EResourceToBalance::CPU) || !THive::IsValidMetricsCPU(metrics)) {
435460
std::get<NMetrics::EResource::CPU>(values) = 0;
436461
}
437-
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kMemoryFieldNumber) == allowedMetricIds.end() || !THive::IsValidMetricsMemory(metrics)) {
462+
if (!HasAllowedMetric(allowedMetricIds, EResourceToBalance::Memory) || !THive::IsValidMetricsMemory(metrics)) {
438463
std::get<NMetrics::EResource::Memory>(values) = 0;
439464
}
440-
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kNetworkFieldNumber) == allowedMetricIds.end() || !THive::IsValidMetricsNetwork(metrics)) {
465+
if (!HasAllowedMetric(allowedMetricIds, EResourceToBalance::Network) || !THive::IsValidMetricsNetwork(metrics)) {
441466
std::get<NMetrics::EResource::Network>(values) = 0;
442467
}
443468
}

ydb/core/mind/hive/tablet_info.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,8 @@ struct TTabletInfo {
227227
void BecomeUnknown(TNodeInfo* node);
228228
bool Kick();
229229
const TVector<i64>& GetTabletAllowedMetricIds() const;
230+
static bool HasAllowedMetric(const TVector<i64>& allowedMetricIds, EResourceToBalance resource);
231+
bool HasAllowedMetric(EResourceToBalance resource) const;
230232

231233
void UpdateResourceUsage(const NKikimrTabletBase::TMetrics& metrics);
232234
TResourceRawValues GetResourceCurrentValues() const;
@@ -237,18 +239,18 @@ struct TTabletInfo {
237239
void ActualizeCounter();
238240

239241
// Computes a usage figure for the selected resource: `current` is normalized
// against `maximum` with NormalizeRawValues, then ExtractResourceUsage picks
// the component chosen by `resource` from the normalized values.
// The '-' line below is the pre-change signature (default Dominant); the '+'
// line is the post-change signature (default ComputeResources).
template <typename ResourcesType>
240-
static double GetUsage(const ResourcesType& current, const ResourcesType& maximum, EResourceToBalance resource = EResourceToBalance::Dominant) {
242+
static double GetUsage(const ResourcesType& current, const ResourcesType& maximum, EResourceToBalance resource = EResourceToBalance::ComputeResources) {
241243
auto normValues = NormalizeRawValues(current, maximum);
242244
return ExtractResourceUsage(normValues, resource);
243245
}
244246

245-
// Picks one usage number out of already-normalized resource values.
// CPU, Memory, Network and Counter return the matching tuple element via
// std::get; the aggregate enumerator returns max(normValues).
// '-' lines show the pre-change spelling (Dominant), '+' lines the post-change
// spelling (ComputeResources); the default argument is renamed the same way.
static double ExtractResourceUsage(const TResourceNormalizedValues& normValues, EResourceToBalance resource = EResourceToBalance::Dominant) {
247+
static double ExtractResourceUsage(const TResourceNormalizedValues& normValues, EResourceToBalance resource = EResourceToBalance::ComputeResources) {
246248
switch (resource) {
247249
case EResourceToBalance::CPU: return std::get<NMetrics::EResource::CPU>(normValues);
248250
case EResourceToBalance::Memory: return std::get<NMetrics::EResource::Memory>(normValues);
249251
case EResourceToBalance::Network: return std::get<NMetrics::EResource::Network>(normValues);
250252
case EResourceToBalance::Counter: return std::get<NMetrics::EResource::Counter>(normValues);
251-
case EResourceToBalance::Dominant: return max(normValues);
253+
// NOTE(review): there is no return after the switch, so a value outside the
// listed enumerators would flow off the end of this function.
case EResourceToBalance::ComputeResources: return max(normValues);
252254
}
253255
}
254256

0 commit comments

Comments
 (0)