From 893d5ae60e1f3d5a9ea6d007560490962e7e1efe Mon Sep 17 00:00:00 2001 From: Alexander Zalyalov Date: Thu, 27 Feb 2025 12:23:33 +0000 Subject: [PATCH 1/2] handle follower dying while being promoted to leader --- ydb/core/mind/hive/hive_ut.cpp | 58 +++++++++++++++++++++++----------- ydb/core/mind/local.cpp | 30 +++++++++++++++--- 2 files changed, 65 insertions(+), 23 deletions(-) diff --git a/ydb/core/mind/hive/hive_ut.cpp b/ydb/core/mind/hive/hive_ut.cpp index ae6823df3bcf..ea8b27b32d39 100644 --- a/ydb/core/mind/hive/hive_ut.cpp +++ b/ydb/core/mind/hive/hive_ut.cpp @@ -3363,7 +3363,7 @@ Y_UNIT_TEST_SUITE(THiveTest) { } } - Y_UNIT_TEST(TestFollowerPromotion) { + void TestFollowerPromotion(bool killDuringPromotion) { constexpr int NODES = 3; TTestBasicRuntime runtime(NODES, false); Setup(runtime, true); @@ -3394,30 +3394,42 @@ Y_UNIT_TEST_SUITE(THiveTest) { MakeSureTabletIsUp(runtime, tabletId, i, &pipeConfig, &tabletRolesBefore[i]); } int leaders = std::accumulate(tabletRolesBefore.begin(), tabletRolesBefore.end(), 0, [](int a, bool b) -> int { return b ? a + 1 : a; }); - UNIT_ASSERT_VALUES_EQUAL(leaders, 1); int leaderNode = std::find(tabletRolesBefore.begin(), tabletRolesBefore.end(), true) - tabletRolesBefore.begin(); - // killing leader - SendKillLocal(runtime, leaderNode); + UNIT_ASSERT_VALUES_EQUAL(leaders, 1); { - TDispatchOptions options; - options.FinalEvents.emplace_back(TEvLocal::EvTabletStatus); - runtime.DispatchEvents(options); - } - std::array tabletRolesIntermediate = {}; - for (int i = 0; i < NODES; ++i) { - if (i != leaderNode) { - MakeSureTabletIsUp(runtime, tabletId, i, &pipeConfig, &tabletRolesIntermediate[i]); - } else { - tabletRolesIntermediate[i] = false; + TBlockEvents blockPromote(runtime); + // killing leader + SendKillLocal(runtime, leaderNode); + + while (blockPromote.empty()) { + runtime.DispatchEvents({}, TDuration::MilliSeconds(100)); + } + + if (killDuringPromotion) { + for (int i = 0; i < NODES; ++i) { + if (i == leaderNode) { + continue; + } + TActorId sender = runtime.AllocateEdgeActor(i); + runtime.SendToPipe(tabletId, sender, new TEvents::TEvPoisonPill, i, pipeConfig); + } } + + runtime.DispatchEvents({}, TDuration::MilliSeconds(100)); + + blockPromote.Stop().Unblock(); + } + { + TDispatchOptions options; + options.FinalEvents.emplace_back(TEvLocal::EvTabletStatus, killDuringPromotion ? 3 : 1); + runtime.DispatchEvents(options, TDuration::MilliSeconds(100)); } - leaders = std::accumulate(tabletRolesIntermediate.begin(), tabletRolesIntermediate.end(), 0, [](int a, bool b) -> int { return b ? a + 1 : a; }); - int followers = std::accumulate(tabletRolesIntermediate.begin(), tabletRolesIntermediate.end(), 0, [](int a, bool b) -> int { return b ? a : a + 1; }); - UNIT_ASSERT_VALUES_EQUAL(leaders, 1); - UNIT_ASSERT_VALUES_EQUAL(followers, 2); std::unordered_set> activeTablets; TActorId senderA = runtime.AllocateEdgeActor(); for (int i = 0; i < NODES; ++i) { + if (i == leaderNode) { + continue; + } TActorId whiteboard = NNodeWhiteboard::MakeNodeWhiteboardServiceId(runtime.GetNodeId(i)); runtime.Send(new IEventHandle(whiteboard, senderA, new NNodeWhiteboard::TEvWhiteboard::TEvTabletStateRequest())); TAutoPtr handle; @@ -3432,6 +3444,16 @@ Y_UNIT_TEST_SUITE(THiveTest) { } } UNIT_ASSERT_VALUES_EQUAL(activeTablets.size(), 3); + leaders = std::count_if(activeTablets.begin(), activeTablets.end(), [](auto&& p) { return p.second == 0; }); + UNIT_ASSERT_VALUES_EQUAL(leaders, 1); + } + + Y_UNIT_TEST(TestFollowerPromotion) { + TestFollowerPromotion(false); + } + + Y_UNIT_TEST(TestFollowerPromotionFollowerDies) { + TestFollowerPromotion(true); } Y_UNIT_TEST(TestManyFollowersOnOneNode) { diff --git a/ydb/core/mind/local.cpp b/ydb/core/mind/local.cpp index 3bffef054101..64fdfd2bcf8e 100644 --- a/ydb/core/mind/local.cpp +++ b/ydb/core/mind/local.cpp @@ -52,19 +52,19 @@ class TLocalNodeRegistrar : public TActorBootstrapped { ui32 Generation; TTabletTypes::EType TabletType; NKikimrLocal::EBootMode BootMode; - ui32 FollowerId; TTablet() : Tablet() , Generation(0) , TabletType() , BootMode(NKikimrLocal::EBootMode::BOOT_MODE_LEADER) - , FollowerId(0) {} }; struct TTabletEntry : TTablet { TInstant From; + bool IsPromoting = false; + ui32 PromotingFromFollower = 0; TTabletEntry() : From(TInstant::MicroSeconds(0)) @@ -141,6 +141,10 @@ class TLocalNodeRegistrar : public TActorBootstrapped { ::NMonitoring::TDynamicCounters::TCounterPtr CounterCancelDemotedByBS; ::NMonitoring::TDynamicCounters::TCounterPtr CounterCancelUnknownReason; + static TTabletId LeaderId(TTabletId tabletId) { + return {tabletId.first, 0}; + } + void Die(const TActorContext &ctx) override { if (HivePipeClient) { if (Connected) { @@ -430,15 +434,17 @@ class TLocalNodeRegistrar : public TActorBootstrapped { // promote to leader it->second.BootMode = NKikimrLocal::EBootMode::BOOT_MODE_LEADER; it->second.Generation = suggestedGen; - tabletId.second = 0; // FollowerId = 0 - TTabletEntry &entry = InbootTablets[tabletId]; + TTabletId leaderId = LeaderId(tabletId); + TTabletEntry &entry = InbootTablets[leaderId]; entry = it->second; entry.From = ctx.Now(); entry.BootMode = NKikimrLocal::EBootMode::BOOT_MODE_LEADER; entry.Generation = suggestedGen; + entry.IsPromoting = true; + entry.PromotingFromFollower = tabletId.second; ctx.Send(entry.Tablet, new TEvTablet::TEvPromoteToLeader(suggestedGen, info)); MarkDeadTablet(it->first, 0, TEvLocal::TEvTabletStatus::StatusSupersededByLeader, TEvTablet::TEvTabletDead::ReasonError, ctx); - OnlineTablets.erase(it); + it->second.IsPromoting = true; LOG_DEBUG_S(ctx, NKikimrServices::LOCAL, "TLocalNodeRegistrar::Handle TEvLocal::TEvBootTablet follower tablet " << tabletId << " promoted to leader"); return; @@ -718,6 +724,12 @@ class TLocalNodeRegistrar : public TActorBootstrapped { << " marked as running at generation " << generation); NTabletPipe::SendData(ctx, HivePipeClient, new TEvLocal::TEvTabletStatus(TEvLocal::TEvTabletStatus::StatusOk, tabletId, generation)); + if (inbootIt->second.IsPromoting) { + TTabletId promotedTablet{tabletId.first, inbootIt->second.PromotingFromFollower}; + OnlineTablets.erase(promotedTablet); + inbootIt->second.IsPromoting = false; + inbootIt->second.PromotingFromFollower = 0; + } OnlineTablets.emplace(tabletId, inbootIt->second); InbootTablets.erase(inbootIt); } @@ -818,6 +830,14 @@ class TLocalNodeRegistrar : public TActorBootstrapped { }); if (onlineIt != OnlineTablets.end()) { // from online list MarkDeadTablet(onlineIt->first, generation, TEvLocal::TEvTabletStatus::StatusFailed, msg->Reason, ctx); + if (onlineIt->second.IsPromoting) { + TTabletId leader = LeaderId(onlineIt->first); + auto inbootIt = InbootTablets.find(leader); + if (inbootIt != InbootTablets.end()) { + MarkDeadTablet(leader, inbootIt->second.Generation, TEvLocal::TEvTabletStatus::StatusFailed, msg->Reason, ctx); + } + InbootTablets.erase(inbootIt); + } OnlineTablets.erase(onlineIt); UpdateEstimate(); return; From e47265fdf7e9395aa3a9b90cdd69d96f95606c1e Mon Sep 17 00:00:00 2001 From: Alexander Zalyalov Date: Mon, 3 Mar 2025 09:05:14 +0000 Subject: [PATCH 2/2] code review --- ydb/core/mind/local.cpp | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/ydb/core/mind/local.cpp b/ydb/core/mind/local.cpp index 64fdfd2bcf8e..a8242818bf8c 100644 --- a/ydb/core/mind/local.cpp +++ b/ydb/core/mind/local.cpp @@ -389,6 +389,24 @@ class TLocalNodeRegistrar : public TActorBootstrapped { ScheduleSendTabletMetrics(ctx); } + void StartPromotion(TTabletId tabletId, TOnlineTabletEntry& followerEntry, ui32 suggestedGen, TInstant now) { + TTabletId leaderId = LeaderId(tabletId); + TTabletEntry& leaderEntry = InbootTablets[leaderId]; + followerEntry.IsPromoting = true; + leaderEntry = followerEntry; + leaderEntry.From = now; + leaderEntry.BootMode = NKikimrLocal::EBootMode::BOOT_MODE_LEADER; + leaderEntry.Generation = suggestedGen; + leaderEntry.PromotingFromFollower = tabletId.second; + } + + void FinishPromotion(TTabletId tabletId, TTabletEntry& entry) { + TTabletId promotedTablet{tabletId.first, entry.PromotingFromFollower}; + OnlineTablets.erase(promotedTablet); + entry.IsPromoting = false; + entry.PromotingFromFollower = 0; + } + void Handle(TEvLocal::TEvBootTablet::TPtr &ev, const TActorContext &ctx) { NKikimrLocal::TEvBootTablet &record = ev->Get()->Record; TIntrusivePtr info(TabletStorageInfoFromProto(record.GetInfo())); @@ -431,20 +449,9 @@ class TLocalNodeRegistrar : public TActorBootstrapped { if (it != OnlineTablets.end()) { if (it->second.BootMode == NKikimrLocal::EBootMode::BOOT_MODE_FOLLOWER && record.GetBootMode() == NKikimrLocal::EBootMode::BOOT_MODE_LEADER) { - // promote to leader - it->second.BootMode = NKikimrLocal::EBootMode::BOOT_MODE_LEADER; - it->second.Generation = suggestedGen; - TTabletId leaderId = LeaderId(tabletId); - TTabletEntry &entry = InbootTablets[leaderId]; - entry = it->second; - entry.From = ctx.Now(); - entry.BootMode = NKikimrLocal::EBootMode::BOOT_MODE_LEADER; - entry.Generation = suggestedGen; - entry.IsPromoting = true; - entry.PromotingFromFollower = tabletId.second; - ctx.Send(entry.Tablet, new TEvTablet::TEvPromoteToLeader(suggestedGen, info)); + StartPromotion(tabletId, it->second, suggestedGen, ctx.Now()); + ctx.Send(it->second.Tablet, new TEvTablet::TEvPromoteToLeader(suggestedGen, info)); MarkDeadTablet(it->first, 0, TEvLocal::TEvTabletStatus::StatusSupersededByLeader, TEvTablet::TEvTabletDead::ReasonError, ctx); - it->second.IsPromoting = true; LOG_DEBUG_S(ctx, NKikimrServices::LOCAL, "TLocalNodeRegistrar::Handle TEvLocal::TEvBootTablet follower tablet " << tabletId << " promoted to leader"); return; @@ -725,10 +732,7 @@ class TLocalNodeRegistrar : public TActorBootstrapped { << generation); NTabletPipe::SendData(ctx, HivePipeClient, new TEvLocal::TEvTabletStatus(TEvLocal::TEvTabletStatus::StatusOk, tabletId, generation)); if (inbootIt->second.IsPromoting) { - TTabletId promotedTablet{tabletId.first, inbootIt->second.PromotingFromFollower}; - OnlineTablets.erase(promotedTablet); - inbootIt->second.IsPromoting = false; - inbootIt->second.PromotingFromFollower = 0; + FinishPromotion(tabletId, inbootIt->second); } OnlineTablets.emplace(tabletId, inbootIt->second); InbootTablets.erase(inbootIt);