Skip to content

Commit 1705d00

Browse files
authored
Merge 239c8bd into d3f1c82
2 parents d3f1c82 + 239c8bd commit 1705d00

File tree

6 files changed

+47
-1
lines changed

6 files changed

+47
-1
lines changed

ydb/core/mind/hive/hive_impl.cpp

+32
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,7 @@ void THive::Handle(TEvPrivate::TEvBootTablets::TPtr&) {
497497
for (auto* node : unimportantNodes) {
498498
node->Ping();
499499
}
500+
ProcessNodePingQueue();
500501
TVector<TTabletId> tabletsToReleaseFromParent;
501502
TSideEffects sideEffects;
502503
sideEffects.Reset(SelfId());
@@ -687,11 +688,13 @@ void THive::Cleanup() {
687688

688689
void THive::Handle(TEvLocal::TEvStatus::TPtr& ev) {
689690
BLOG_D("Handle TEvLocal::TEvStatus for Node " << ev->Sender.NodeId() << ": " << ev->Get()->Record.ShortDebugString());
691+
RemoveFromPingInProgress(ev->Sender.NodeId());
690692
Execute(CreateStatus(ev->Sender, ev->Get()->Record));
691693
}
692694

693695
void THive::Handle(TEvLocal::TEvSyncTablets::TPtr& ev) {
694696
BLOG_D("THive::Handle::TEvSyncTablets");
697+
RemoveFromPingInProgress(ev->Sender.NodeId());
695698
Execute(CreateSyncTablets(ev->Sender, ev->Get()->Record));
696699
}
697700

@@ -745,6 +748,7 @@ void THive::Handle(TEvInterconnect::TEvNodeConnected::TPtr &ev) {
745748
void THive::Handle(TEvInterconnect::TEvNodeDisconnected::TPtr &ev) {
746749
TNodeId nodeId = ev->Get()->NodeId;
747750
BLOG_W("Handle TEvInterconnect::TEvNodeDisconnected, NodeId " << nodeId);
751+
RemoveFromPingInProgress(nodeId);
748752
if (ConnectedNodes.erase(nodeId)) {
749753
UpdateCounterNodesConnected(-1);
750754
}
@@ -917,6 +921,7 @@ void THive::Handle(TEvents::TEvUndelivered::TPtr &ev) {
917921
case TEvLocal::EvPing: {
918922
TNodeId nodeId = ev->Cookie;
919923
TNodeInfo* node = FindNode(nodeId);
924+
NodePingsInProgress.erase(nodeId);
920925
if (node != nullptr && ev->Sender == node->Local) {
921926
if (node->IsDisconnecting()) {
922927
// ping continiousily until we fully disconnected from the node
@@ -925,6 +930,7 @@ void THive::Handle(TEvents::TEvUndelivered::TPtr &ev) {
925930
KillNode(node->Id, node->Local);
926931
}
927932
}
933+
ProcessNodePingQueue();
928934
break;
929935
}
930936
};
@@ -1696,6 +1702,13 @@ void THive::UpdateCounterTabletsStarting(i64 tabletsStartingDiff) {
16961702
}
16971703
}
16981704

1705+
void THive::UpdateCounterPingQueueSize() {
1706+
if (TabletCounters != nullptr) {
1707+
auto& counter = TabletCounters->Simple()[NHive::COUNTER_PINGQUEUE_SIZE];
1708+
counter.Set(NodePingQueue.size());
1709+
}
1710+
}
1711+
16991712
void THive::RecordTabletMove(const TTabletMoveInfo& moveInfo) {
17001713
TabletMoveHistory.PushBack(moveInfo);
17011714
TabletCounters->Cumulative()[NHive::COUNTER_TABLETS_MOVED].Increment(1);
@@ -2672,6 +2685,25 @@ void THive::ExecuteStartTablet(TFullTabletId tabletId, const TActorId& local, ui
26722685
Execute(CreateStartTablet(tabletId, local, cookie, external));
26732686
}
26742687

2688+
void THive::QueuePing(const TActorId& local) {
2689+
NodePingQueue.push(local);
2690+
}
2691+
2692+
void THive::ProcessNodePingQueue() {
2693+
while (!NodePingQueue.empty() && NodePingsInProgress.size() < GetMaxPingsInFlight()) {
2694+
TActorId local = NodePingQueue.front();
2695+
TNodeId node = local.NodeId();
2696+
NodePingQueue.pop();
2697+
NodePingsInProgress.insert(node);
2698+
SendPing(local, node);
2699+
}
2700+
}
2701+
2702+
void THive::RemoveFromPingInProgress(TNodeId node) {
2703+
NodePingsInProgress.erase(node);
2704+
ProcessNodePingQueue();
2705+
}
2706+
26752707
void THive::SendPing(const TActorId& local, TNodeId id) {
26762708
Send(local,
26772709
new TEvLocal::TEvPing(HiveId,

ydb/core/mind/hive/hive_impl.h

+10
Original file line numberDiff line numberDiff line change
@@ -415,6 +415,8 @@ class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveShar
415415
TEventPriorityQueue<THive> EventQueue{*this};
416416
ui64 OperationsLogIndex = 0;
417417
std::vector<TActorId> ActorsWaitingToMoveTablets;
418+
std::queue<TActorId> NodePingQueue;
419+
std::unordered_set<TNodeId> NodePingsInProgress;
418420

419421
struct TPendingCreateTablet {
420422
NKikimrHive::TEvCreateTablet CreateTablet;
@@ -650,6 +652,7 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId
650652
void UpdateCounterEventQueueSize(i64 eventQueueSizeDiff);
651653
void UpdateCounterNodesConnected(i64 nodesConnectedDiff);
652654
void UpdateCounterTabletsStarting(i64 tabletsStartingDiff);
655+
void UpdateCounterPingQueueSize();
653656
void RecordTabletMove(const TTabletMoveInfo& info);
654657
bool DomainHasNodes(const TSubDomainKey &domainKey) const;
655658
void ProcessBootQueue();
@@ -678,7 +681,10 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId
678681
void UpdateRegisteredDataCenters();
679682
void AddRegisteredDataCentersNode(TDataCenterId dataCenterId, TNodeId nodeId);
680683
void RemoveRegisteredDataCentersNode(TDataCenterId dataCenterId, TNodeId nodeId);
684+
void QueuePing(const TActorId& local);
681685
void SendPing(const TActorId& local, TNodeId id);
686+
void RemoveFromPingInProgress(TNodeId node);
687+
void ProcessNodePingQueue();
682688
void SendReconnect(const TActorId& local);
683689
static THolder<TGroupFilter> BuildGroupParametersForChannel(const TLeaderTabletInfo& tablet, ui32 channelId);
684690
void KickTablet(const TTabletInfo& tablet);
@@ -943,6 +949,10 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId
943949
return CurrentConfig.GetLessSystemTabletsMoves();
944950
}
945951

952+
ui64 GetMaxPingsInFlight() const {
953+
return CurrentConfig.GetMaxPingsInFlight();
954+
}
955+
946956
static void ActualizeRestartStatistics(google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier);
947957
static ui64 GetRestartsPerPeriod(const google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier);
948958
static bool IsSystemTablet(TTabletTypes::EType type);

ydb/core/mind/hive/node_info.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -356,7 +356,7 @@ void TNodeInfo::DeregisterInDomains() {
356356
void TNodeInfo::Ping() {
357357
Y_ABORT_UNLESS((bool)Local);
358358
BLOG_D("Node(" << Id << ") Ping(" << Local << ")");
359-
Hive.SendPing(Local, Id);
359+
Hive.QueuePing(Local);
360360
}
361361

362362
void TNodeInfo::SendReconnect(const TActorId& local) {

ydb/core/mind/hive/tx__register_node.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,9 @@ class TTxRegisterNode : public TTransactionBase<THive> {
8787
BLOG_D("THive::TTxRegisterNode(" << Local.NodeId() << ")::Complete");
8888
TNodeInfo* node = Self->FindNode(Local.NodeId());
8989
if (node != nullptr && node->Local) { // we send ping on every RegisterNode because we want to re-sync tablets upon every reconnection
90+
Self->NodePingsInProgress.erase(node->Id);
9091
node->Ping();
92+
Self->ProcessNodePingQueue();
9193
}
9294
}
9395
};

ydb/core/protos/config.proto

+1
Original file line numberDiff line numberDiff line change
@@ -1472,6 +1472,7 @@ message THiveConfig {
14721472
optional bool EnableDestroyOperations = 74 [default = false];
14731473
optional double NodeUsageRangeToKick = 75 [default = 0.2];
14741474
optional bool LessSystemTabletsMoves = 77 [default = true];
1475+
optional uint64 MaxPingsInFlight = 78 [default = 1000];
14751476
}
14761477

14771478
message TBlobCacheConfig {

ydb/core/protos/counters_hive.proto

+1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ enum ESimpleCounters {
3030
COUNTER_WORST_OBJECT_VARIANCE = 20 [(CounterOpts) = {Name: "WorstObjectVariance"}];
3131
COUNTER_STORAGE_SCATTER = 21 [(CounterOpts) = {Name: "StorageScatter"}];
3232
COUNTER_TABLETS_STARTING = 22 [(CounterOpts) = {Name: "TabletsStarting"}];
33+
COUNTER_PINGQUEUE_SIZE = 23 [(CounterOpts) = {Name: "PingQueueSize"}];
3334
}
3435

3536
enum ECumulativeCounters {

0 commit comments

Comments
 (0)