Skip to content

Commit dd44e94

Browse files
authored
Reset replication counters correctly to avoid data race on VDisk restart (#5486)
1 parent 7a49804 commit dd44e94

File tree

4 files changed: 37 additions and 2 deletions.

ydb/core/blobstorage/nodewarden/node_warden_impl.h

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -335,6 +335,9 @@ namespace NKikimr::NStorage {
335335
bool ReadOnly;
336336
};
337337
std::optional<TRuntimeData> RuntimeData;
338+
bool ShutdownPending = false;
339+
bool RestartAfterShutdown = false;
340+
TDuration YardInitDelay;
338341

339342
// Last VDiskId reported to Node Whiteboard.
340343
std::optional<TVDiskID> WhiteboardVDiskId;
@@ -386,13 +389,15 @@ namespace NKikimr::NStorage {
386389
};
387390

388391
std::map<TVSlotId, TVDiskRecord> LocalVDisks;
392+
THashMap<TActorId, TVSlotId> VDiskIdByActor;
389393
std::map<TVSlotId, ui64> SlayInFlight;
390394
std::set<ui32> PDiskRestartInFlight;
391395
TIntrusiveList<TVDiskRecord, TUnreportedMetricTag> VDisksWithUnreportedMetrics;
392396

393397
void DestroyLocalVDisk(TVDiskRecord& vdisk);
394398
void PoisonLocalVDisk(TVDiskRecord& vdisk);
395399
void StartLocalVDiskActor(TVDiskRecord& vdisk, TDuration yardInitDelay);
400+
void HandleGone(STATEFN_SIG);
396401
void ApplyServiceSetVDisks(const NKikimrBlobStorage::TNodeWardenServiceSet& serviceSet);
397402

398403
// process VDisk configuration
@@ -646,6 +651,8 @@ namespace NKikimr::NStorage {
646651
hFunc(TEvNodeWardenQueryBaseConfig, Handle);
647652
hFunc(TEvNodeConfigInvokeOnRootResult, Handle);
648653

654+
fFunc(TEvents::TSystem::Gone, HandleGone);
655+
649656
default:
650657
EnqueuePendingMessage(ev);
651658
break;

ydb/core/blobstorage/nodewarden/node_warden_vdisk.cpp

Lines changed: 27 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,7 @@ namespace NKikimr::NStorage {
4848
vdisk.ScrubCookie = 0; // disable reception of Scrub messages from this disk
4949
vdisk.ScrubCookieForController = 0; // and from controller too
5050
vdisk.Status = NKikimrBlobStorage::EVDiskStatus::ERROR;
51+
vdisk.ShutdownPending = true;
5152

5253
SendDiskMetrics(false);
5354
}
@@ -70,6 +71,12 @@ namespace NKikimr::NStorage {
7071
return;
7172
}
7273

74+
if (vdisk.ShutdownPending) {
75+
vdisk.RestartAfterShutdown = true;
76+
vdisk.YardInitDelay = Max(vdisk.YardInitDelay, yardInitDelay);
77+
return;
78+
}
79+
7380
// find underlying PDisk and determine its media type
7481
auto pdiskIt = LocalPDisks.find({vslotId.NodeId, vslotId.PDiskId});
7582
Y_VERIFY_S(pdiskIt != LocalPDisks.end(), "PDiskId# " << vslotId.NodeId << ":" << vslotId.PDiskId << " not found");
@@ -206,8 +213,10 @@ namespace NKikimr::NStorage {
206213

207214
// create an actor
208215
auto *as = TActivationContext::ActorSystem();
209-
as->RegisterLocalService(vdiskServiceId, as->Register(CreateVDisk(vdiskConfig, groupInfo, AppData()->Counters),
210-
TMailboxType::Revolving, AppData()->SystemPoolId));
216+
TActorId actorId = as->Register(CreateVDisk(vdiskConfig, groupInfo, AppData()->Counters),
217+
TMailboxType::Revolving, AppData()->SystemPoolId);
218+
as->RegisterLocalService(vdiskServiceId, actorId);
219+
VDiskIdByActor.try_emplace(actorId, vslotId);
211220

212221
STLOG(PRI_DEBUG, BS_NODE, NW24, "StartLocalVDiskActor done", (VDiskId, vdisk.GetVDiskId()), (VSlotId, vslotId),
213222
(PDiskGuid, pdiskGuid));
@@ -233,6 +242,22 @@ namespace NKikimr::NStorage {
233242
vdisk.ScrubCookie = scrubCookie;
234243
}
235244

245+
// Handler for the TEvents::TSystem::Gone event.
// Looks up the sender in VDiskIdByActor; unknown senders are ignored. For a known
// sender, the matching LocalVDisks record (if any) has ShutdownPending cleared and,
// when RestartAfterShutdown is set, the VDisk actor is started again with the
// YardInitDelay stored in the record. The sender's VDiskIdByActor entry is erased
// whether or not a LocalVDisks record was found.
void TNodeWarden::HandleGone(STATEFN_SIG) {
246+
if (const auto it = VDiskIdByActor.find(ev->Sender); it != VDiskIdByActor.end()) {
247+
if (const auto jt = LocalVDisks.find(it->second); jt != LocalVDisks.end()) {
248+
TVDiskRecord& vdisk = jt->second;
249+
// a Gone notification for a tracked actor is expected only while its shutdown is pending
Y_ABORT_UNLESS(vdisk.ShutdownPending);
250+
vdisk.ShutdownPending = false;
251+
if (vdisk.RestartAfterShutdown) {
252+
// ShutdownPending is already cleared at this point, before the restart call is made
StartLocalVDiskActor(vdisk, vdisk.YardInitDelay);
253+
// reset the deferred-restart state stored in the record
vdisk.RestartAfterShutdown = false;
254+
vdisk.YardInitDelay = TDuration::Zero();
255+
}
256+
}
257+
// drop the actor-to-slot mapping for the sender
VDiskIdByActor.erase(it);
258+
}
259+
}
260+
236261
void TNodeWarden::ApplyServiceSetVDisks(const NKikimrBlobStorage::TNodeWardenServiceSet& serviceSet) {
237262
for (const auto& vdisk : serviceSet.GetVDisks()) {
238263
ApplyLocalVDiskInfo(vdisk);

ydb/core/blobstorage/ut_group/main.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -123,6 +123,7 @@ class TNodeWardenMockActor : public TActor<TNodeWardenMockActor> {
123123
cFunc(TEvBlobStorage::EvDropDonor, Ignore);
124124
cFunc(TEvBlobStorage::EvGroupStatReport, Ignore);
125125
cFunc(TEvBlobStorage::EvNotifyVDiskGenerationChange, Ignore);
126+
cFunc(TEvents::TSystem::Gone, Ignore);
126127

127128
fFunc(TEvBlobStorage::EvPut, ForwardToProxy);
128129
fFunc(TEvBlobStorage::EvGet, ForwardToProxy);

ydb/core/blobstorage/vdisk/skeleton/blobstorage_skeletonfront.cpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2209,6 +2209,8 @@ namespace NKikimr {
22092209

22102210
// PassAway override: removes CountersChain from VDiskCountersBase, sends a
// TEvents::TSystem::Gone event built from MakeBlobStorageNodeWardenID(SelfId().NodeId())
// and SelfId(), and then calls the base-class TActorBootstrapped::PassAway().
void PassAway() override {
22112211
VDiskCountersBase->RemoveSubgroupChain(CountersChain);
2212+
// NOTE(review): presumably the third argument is the recipient (the node warden of this
// node) and the fourth is the sender; confirm against the IEventHandle constructor
TActivationContext::Send(new IEventHandle(TEvents::TSystem::Gone, 0,
2213+
MakeBlobStorageNodeWardenID(SelfId().NodeId()), SelfId(), nullptr, 0));
22122214
TActorBootstrapped::PassAway();
22132215
}
22142216
};

0 commit comments

Comments
 (0)