Skip to content

Commit a3e9070

Browse files
pixcc, adameat
authored and committed
Enhance monitoring for serverless tenants in Hive KIKIMR-19289 (ydb-platform#710)
* Enhance monitoring for serverless tenants in Hive KIKIMR-19289
1 parent 38a85bb commit a3e9070

File tree

7 files changed

+67
-3
lines changed

7 files changed

+67
-3
lines changed

ydb/core/mind/hive/domain_info.h

+4
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ struct TDomainInfo {
1515
TTabletId HiveId = 0;
1616
TMaybeServerlessComputeResourcesMode ServerlessComputeResourcesMode;
1717

18+
ui64 TabletsTotal = 0;
19+
ui64 TabletsAlive = 0;
20+
ui64 TabletsAliveInObjectDomain = 0;
21+
1822
ENodeSelectionPolicy GetNodeSelectionPolicy() const;
1923
};
2024

ydb/core/mind/hive/hive_impl.cpp

+18-1
Original file line numberDiff line numberDiff line change
@@ -1550,7 +1550,9 @@ void THive::DeleteTablet(TTabletId tabletId) {
15501550
}
15511551
Y_ENSURE_LOG(nt->second.LockedTablets.count(&tablet) == 0, " Deleting tablet found on node " << nt->first << " in locked set");
15521552
}
1553-
UpdateCounterTabletsTotal(-1 - (tablet.Followers.size()));
1553+
const i64 tabletsTotalDiff = -1 - (tablet.Followers.size());
1554+
UpdateCounterTabletsTotal(tabletsTotalDiff);
1555+
UpdateDomainTabletsTotal(tablet.ObjectDomain, tabletsTotalDiff);
15541556
Tablets.erase(it);
15551557
}
15561558
}
@@ -1583,6 +1585,21 @@ void THive::KillNode(TNodeId nodeId, const TActorId& local) {
15831585
Execute(CreateKillNode(nodeId, local));
15841586
}
15851587

1588+
// Adjusts the per-domain total tablet count.
//
// Adds tabletsTotalDiff (which may be negative) to the TabletsTotal field of
// the Domains entry keyed by objectDomain. Does nothing when objectDomain
// evaluates to false.
//
// NOTE(review): TabletsTotal is declared as ui64 in TDomainInfo while the diff
// is i64, so a negative diff presumably relies on unsigned wraparound; a
// decrement below zero would then show up as a very large value - confirm that
// callers keep increments and decrements balanced.
// NOTE(review): Domains[objectDomain] may create an entry for a key that is
// not present yet, depending on the container type - confirm this is intended.
void THive::UpdateDomainTabletsTotal(const TSubDomainKey& objectDomain, i64 tabletsTotalDiff) {
1589+
if (objectDomain) {
1590+
Domains[objectDomain].TabletsTotal += tabletsTotalDiff;
1591+
}
1592+
}
1593+
1594+
// Adjusts the per-domain alive tablet counters.
//
// Adds tabletsAliveDiff (which may be negative) to the TabletsAlive field of
// the Domains entry keyed by objectDomain. When tabletNodeDomain compares equal
// to objectDomain, the same diff is also added to TabletsAliveInObjectDomain.
// Does nothing when objectDomain evaluates to false.
//
// Callers visible in this commit pass the serviced domain of the node hosting
// the tablet as tabletNodeDomain.
//
// NOTE(review): both counters are declared as ui64 in TDomainInfo while the
// diff is i64, so a negative diff presumably relies on unsigned wraparound -
// confirm that increments and decrements are always paired with the same
// tabletNodeDomain, otherwise TabletsAliveInObjectDomain could drift.
void THive::UpdateDomainTabletsAlive(const TSubDomainKey& objectDomain, i64 tabletsAliveDiff, const TSubDomainKey& tabletNodeDomain) {
1595+
if (objectDomain) {
1596+
Domains[objectDomain].TabletsAlive += tabletsAliveDiff;
1597+
// Count the tablet as "alive in its own domain" only when the hosting
// node's domain matches the tablet's object domain.
if (tabletNodeDomain == objectDomain) {
1598+
Domains[objectDomain].TabletsAliveInObjectDomain += tabletsAliveDiff;
1599+
}
1600+
}
1601+
}
1602+
15861603
void THive::SetCounterTabletsTotal(ui64 tabletsTotal) {
15871604
if (TabletCounters != nullptr) {
15881605
auto& counter = TabletCounters->Simple()[NHive::COUNTER_TABLETS_TOTAL];

ydb/core/mind/hive/hive_impl.h

+2
Original file line numberDiff line numberDiff line change
@@ -633,6 +633,8 @@ class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveShar
633633
TTabletCategoryInfo& GetTabletCategory(TTabletCategoryId tabletCategoryId);
634634
void KillNode(TNodeId nodeId, const TActorId& local);
635635
void AddToBootQueue(TTabletInfo* tablet);
636+
void UpdateDomainTabletsTotal(const TSubDomainKey& objectDomain, i64 tabletsTotalDiff);
637+
void UpdateDomainTabletsAlive(const TSubDomainKey& objectDomain, i64 tabletsAliveDiff, const TSubDomainKey& tabletNodeDomain);
636638
void SetCounterTabletsTotal(ui64 tabletsTotal);
637639
void UpdateCounterTabletsTotal(i64 tabletsTotalDiff);
638640
void UpdateCounterTabletsAlive(i64 tabletsAliveDiff);

ydb/core/mind/hive/leader_tablet_info.cpp

+19
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ TFollowerId TLeaderTabletInfo::GetFollowerPromotableOnNode(TNodeId nodeId) const
6363
}
6464

6565
void TLeaderTabletInfo::AssignDomains(const TSubDomainKey& objectDomain, const TVector<TSubDomainKey>& allowedDomains) {
66+
const TSubDomainKey oldObjectDomain = ObjectDomain;
67+
6668
if (!allowedDomains.empty()) {
6769
NodeFilter.AllowedDomains = allowedDomains;
6870
if (!objectDomain) {
@@ -82,6 +84,22 @@ void TLeaderTabletInfo::AssignDomains(const TSubDomainKey& objectDomain, const T
8284
followerGroup.NodeFilter.AllowedDomains = NodeFilter.AllowedDomains;
8385
followerGroup.NodeFilter.ObjectDomain = NodeFilter.ObjectDomain;
8486
}
87+
88+
const ui64 leaderAndFollowers = 1 + Followers.size();
89+
Hive.UpdateDomainTabletsTotal(oldObjectDomain, -leaderAndFollowers);
90+
Hive.UpdateDomainTabletsTotal(ObjectDomain, +leaderAndFollowers);
91+
92+
if (IsAlive()) {
93+
Hive.UpdateDomainTabletsAlive(oldObjectDomain, -1, Node->GetServicedDomain());
94+
Hive.UpdateDomainTabletsAlive(ObjectDomain, +1, Node->GetServicedDomain());
95+
}
96+
97+
for (const auto& follower : Followers) {
98+
if (follower.IsAlive()) {
99+
Hive.UpdateDomainTabletsAlive(oldObjectDomain, -1, follower.Node->GetServicedDomain());
100+
Hive.UpdateDomainTabletsAlive(ObjectDomain, +1, follower.Node->GetServicedDomain());
101+
}
102+
}
85103
}
86104

87105
bool TLeaderTabletInfo::InitiateAssignTabletGroups() {
@@ -127,6 +145,7 @@ TFollowerTabletInfo& TLeaderTabletInfo::AddFollower(TFollowerGroup& followerGrou
127145
follower.Id = followerId;
128146
}
129147
Hive.UpdateCounterTabletsTotal(+1);
148+
Hive.UpdateDomainTabletsTotal(ObjectDomain, +1);
130149
return follower;
131150
}
132151

ydb/core/mind/hive/monitoring.cpp

+21-1
Original file line numberDiff line numberDiff line change
@@ -459,7 +459,15 @@ class TTxMonEvent_MemStateDomains : public TTransactionBase<THive> {
459459
// out << "<script>$('.container').css('width', 'auto');</script>";
460460
out << "<table class='table table-sortable'>";
461461
out << "<thead>";
462-
out << "<tr><th>TenantId</th><th>Name</th><th>Hive</th><th>Status</th></tr>";
462+
out << "<tr>";
463+
out << "<th>TenantId</th>";
464+
out << "<th>Name</th>";
465+
out << "<th>Hive</th>";
466+
out << "<th>Status</th>";
467+
out << "<th>TabletsAliveInTenantDomain</th>";
468+
out << "<th>TabletsAliveInOtherDomains</th>";
469+
out << "<th>TabletsTotal</th>";
470+
out << "</tr>";
463471
out << "</thead>";
464472
out << "<tbody>";
465473
for (const auto& [domainKey, domainInfo] : Self->Domains) {
@@ -482,6 +490,18 @@ class TTxMonEvent_MemStateDomains : public TTransactionBase<THive> {
482490
out << "<td>-</td>";
483491
out << "<td>-</td>";
484492
}
493+
if (domainInfo.TabletsTotal > 0) {
494+
out << "<td>" << std::round(domainInfo.TabletsAliveInObjectDomain * 100.0 / domainInfo.TabletsTotal) << "%"
495+
<< " (" << domainInfo.TabletsAliveInObjectDomain << " of " << domainInfo.TabletsTotal << ")" << "</td>";
496+
497+
const ui64 tabletsAliveInOtherDomains = domainInfo.TabletsAlive - domainInfo.TabletsAliveInObjectDomain;
498+
out << "<td>" << std::round(tabletsAliveInOtherDomains * 100.0 / domainInfo.TabletsTotal) << "%"
499+
<< " (" << tabletsAliveInOtherDomains << " of " << domainInfo.TabletsTotal << ")" << "</td>";
500+
} else {
501+
out << "<td>-</td>";
502+
out << "<td>-</td>";
503+
}
504+
out << "<td>" << domainInfo.TabletsTotal << "</td>";
485505
out << "</tr>";
486506
}
487507
out << "</tbody>";

ydb/core/mind/hive/node_info.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ bool TNodeInfo::OnTabletChangeVolatileState(TTabletInfo* tablet, TTabletInfo::EV
6969
TabletsRunningByType[tablet->GetTabletType()].erase(tablet);
7070
TabletsOfObject[tablet->GetObjectId()].erase(tablet);
7171
Hive.UpdateCounterTabletsAlive(-1);
72+
Hive.UpdateDomainTabletsAlive(tablet->GetLeader().ObjectDomain, -1, GetServicedDomain());
7273
if (tablet->HasCounter() && tablet->IsLeader()) {
7374
Hive.UpdateObjectCount(tablet->AsLeader(), *this, -1);
7475
}
@@ -84,6 +85,7 @@ bool TNodeInfo::OnTabletChangeVolatileState(TTabletInfo* tablet, TTabletInfo::EV
8485
TabletsRunningByType[tablet->GetTabletType()].emplace(tablet);
8586
TabletsOfObject[tablet->GetObjectId()].emplace(tablet);
8687
Hive.UpdateCounterTabletsAlive(+1);
88+
Hive.UpdateDomainTabletsAlive(tablet->GetLeader().ObjectDomain, +1, GetServicedDomain());
8789
if (tablet->HasCounter() && tablet->IsLeader()) {
8890
Hive.UpdateObjectCount(tablet->AsLeader(), *this, +1);
8991
}

ydb/core/mind/hive/tx__load_everything.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -710,7 +710,7 @@ class TTxLoadEverything : public TTransactionBase<THive> {
710710

711711
void Complete(const TActorContext& ctx) override {
712712
BLOG_NOTICE("THive::TTxLoadEverything::Complete " << Self->DatabaseConfig.ShortDebugString());
713-
i64 tabletsTotal = 0;
713+
ui64 tabletsTotal = 0;
714714
for (auto it = Self->Tablets.begin(); it != Self->Tablets.end(); ++it) {
715715
++tabletsTotal;
716716
for (const TTabletInfo& follower : it->second.Followers) {

0 commit comments

Comments
 (0)