Skip to content

Commit 4b0a8a2

Browse files
authored
Add PDisk error reason to VDisk's PDisk error state (#9302)
1 parent 22f751a commit 4b0a8a2

File tree

4 files changed: 71 additions and 42 deletions.

ydb/core/blobstorage/vdisk/common/vdisk_context.h

Lines changed: 3 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -80,19 +80,13 @@ namespace NKikimr {
8080
TOutOfSpaceState OutOfSpaceState;
8181
// Global stat about huge heap fragmentation
8282
THugeHeapFragmentation HugeHeapFragmentation;
83-
// Tracks PDisk errors
84-
TPDiskErrorState PDiskErrorState;
8583
friend class TDskSpaceTrackerActor;
8684

8785
NMonGroup::TCostGroup CostMonGroup;
8886

8987
public:
9088
TLogger Logger;
9189

92-
TPDiskErrorState::EState GetPDiskErrorState() const {
93-
return PDiskErrorState.GetState();
94-
}
95-
9690
public:
9791
TVDiskContext(
9892
const TActorId &vdiskActorId,
@@ -137,14 +131,11 @@ namespace NKikimr {
137131
case NKikimrProto::CORRUPTED:
138132
case NKikimrProto::OUT_OF_SPACE: {
139133
// Device is out of order
140-
PDiskErrorState.Set(ev.Status, ev.StatusFlags);
141-
auto newState = PDiskErrorState.GetState();
142134
LOG_ERROR(actorSystemOrCtx, NKikimrServices::BS_VDISK_OTHER,
143135
VDISKP(VDiskLogPrefix,
144-
"CheckPDiskResponse: Recoverable error from PDisk: %s newState# %s",
145-
FormatMessage(ev.Status, ev.ErrorReason, ev.StatusFlags, message).data(),
146-
TPDiskErrorState::StateToString(newState)));
147-
actorSystemOrCtx.Send(VDiskActorId, new TEvPDiskErrorStateChange(newState));
136+
"CheckPDiskResponse: Recoverable error from PDisk: %s",
137+
FormatMessage(ev.Status, ev.ErrorReason, ev.StatusFlags, message).data()));
138+
actorSystemOrCtx.Send(VDiskActorId, new TEvPDiskErrorStateChange(ev.Status, ev.StatusFlags, ev.ErrorReason));
148139
return false;
149140
}
150141
default:

ydb/core/blobstorage/vdisk/common/vdisk_pdisk_error.h

Lines changed: 39 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,10 @@
11
#pragma once
2+
23
#include "defs.h"
3-
#include <ydb/core/base/blobstorage.h>
44
#include "vdisk_config.h"
55

6+
#include <ydb/core/base/blobstorage.h>
7+
68
namespace NKikimr {
79

810
////////////////////////////////////////////////////////////////////////////
@@ -31,43 +33,52 @@ namespace NKikimr {
3133
}
3234

3335
TPDiskErrorState() {
34-
SetPrivate(Good);
36+
SetPrivate(Good, "");
3537
}
3638

3739
EState GetState() const {
38-
return static_cast<EState>(AtomicGet(State));
40+
return State;
41+
}
42+
43+
const TString& GetErrorReason() const {
44+
return ErrorReason;
3945
}
4046

4147
// We call this function when PDisk returned ERROR and we pass pdiskFlags to set the correct state
42-
EState Set(NKikimrProto::EReplyStatus status, ui32 pdiskFlags) {
48+
EState Set(NKikimrProto::EReplyStatus status, ui32 pdiskFlags, const TString& errorReason) {
4349
switch (status) {
4450
case NKikimrProto::CORRUPTED:
45-
return SetPrivate(NoWrites);
51+
return SetPrivate(NoWrites, errorReason);
4652
case NKikimrProto::OUT_OF_SPACE:
4753
// check flags additionally
4854
Y_ABORT_UNLESS(pdiskFlags & NKikimrBlobStorage::StatusNotEnoughDiskSpaceForOperation);
49-
return SetPrivate(WriteOnlyLog);
55+
return SetPrivate(WriteOnlyLog, errorReason);
5056
default:
5157
Y_ABORT("Unexpected state# %s", NKikimrProto::EReplyStatus_Name(status).data());
5258
}
5359
}
5460

55-
private:
56-
TAtomic State = 0;
57-
58-
EState SetPrivate(EState state) {
59-
// make sure bad state increments (not decrements), use CAS for that
60-
while (true) {
61-
EState curState = GetState();
62-
if (state > curState) {
63-
// if state is worse than curState:
64-
TAtomicBase newState = static_cast<TAtomicBase>(state);
65-
bool done = AtomicCas(&State, newState, curState);
66-
if (done)
67-
return state;
68-
} else
69-
return curState;
61+
TString ToString() const {
62+
TStringStream ss;
63+
ss << "State# " << StateToString(State);
64+
if (!ErrorReason.empty()) {
65+
ss << ", PDiskError# " << ErrorReason;
7066
}
67+
return ss.Str();
68+
}
69+
70+
private:
71+
EState State = EState::Unspecified;
72+
73+
TString ErrorReason;
74+
75+
EState SetPrivate(EState state, const TString& errorReason) {
76+
if (state > State) {
77+
State = state;
78+
ErrorReason = errorReason;
79+
return state;
80+
} else
81+
return State;
7182
}
7283
};
7384

@@ -80,10 +91,14 @@ namespace NKikimr {
8091
struct TEvPDiskErrorStateChange :
8192
public TEventLocal<TEvPDiskErrorStateChange, TEvBlobStorage::EvPDiskErrorStateChange>
8293
{
83-
const TPDiskErrorState::EState State;
94+
const NKikimrProto::EReplyStatus Status;
95+
const ui32 PDiskFlags;
96+
const TString ErrorReason;
8497

85-
TEvPDiskErrorStateChange(TPDiskErrorState::EState state)
86-
: State(state)
98+
TEvPDiskErrorStateChange(NKikimrProto::EReplyStatus status, ui32 pdiskFlags, const TString &errorReason)
99+
: Status(status)
100+
, PDiskFlags(pdiskFlags)
101+
, ErrorReason(errorReason)
87102
{}
88103
};
89104

ydb/core/blobstorage/vdisk/common/vdisk_pdisk_error_ut.cpp

Lines changed: 22 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -16,20 +16,39 @@ namespace NKikimr {
1616
Y_UNIT_TEST(Basic) {
1717
TPDiskErrorState state;
1818
UNIT_ASSERT(state.GetState() == TPDiskErrorState::Good);
19+
UNIT_ASSERT(state.GetErrorReason().Empty());
1920

20-
state.Set(NKikimrProto::CORRUPTED, 0);
21+
state.Set(NKikimrProto::CORRUPTED, 0, "");
2122
UNIT_ASSERT(state.GetState() == TPDiskErrorState::NoWrites);
23+
UNIT_ASSERT(state.GetErrorReason().Empty());
2224
}
2325

2426
Y_UNIT_TEST(Basic2) {
2527
TPDiskErrorState state;
2628
UNIT_ASSERT(state.GetState() == TPDiskErrorState::Good);
29+
UNIT_ASSERT(state.GetErrorReason().Empty());
2730

28-
state.Set(NKikimrProto::OUT_OF_SPACE, NKikimrBlobStorage::StatusNotEnoughDiskSpaceForOperation);
31+
state.Set(NKikimrProto::OUT_OF_SPACE, NKikimrBlobStorage::StatusNotEnoughDiskSpaceForOperation, "");
2932
UNIT_ASSERT(state.GetState() == TPDiskErrorState::WriteOnlyLog);
33+
UNIT_ASSERT(state.GetErrorReason().Empty());
3034

31-
state.Set(NKikimrProto::CORRUPTED, 0);
35+
state.Set(NKikimrProto::CORRUPTED, 0, "");
3236
UNIT_ASSERT(state.GetState() == TPDiskErrorState::NoWrites);
37+
UNIT_ASSERT(state.GetErrorReason().Empty());
38+
}
39+
40+
Y_UNIT_TEST(BasicErrorReason) {
41+
TPDiskErrorState state;
42+
UNIT_ASSERT(state.GetState() == TPDiskErrorState::Good);
43+
UNIT_ASSERT(state.GetErrorReason().Empty());
44+
45+
state.Set(NKikimrProto::OUT_OF_SPACE, NKikimrBlobStorage::StatusNotEnoughDiskSpaceForOperation, "Foo");
46+
UNIT_ASSERT(state.GetState() == TPDiskErrorState::WriteOnlyLog);
47+
UNIT_ASSERT(state.GetErrorReason() == "Foo");
48+
49+
state.Set(NKikimrProto::CORRUPTED, 0, "Bar");
50+
UNIT_ASSERT(state.GetState() == TPDiskErrorState::NoWrites);
51+
UNIT_ASSERT(state.GetErrorReason() == "Bar");
3352
}
3453

3554
}

ydb/core/blobstorage/vdisk/skeleton/blobstorage_skeletonfront.cpp

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -636,6 +636,7 @@ namespace NKikimr {
636636
friend class TIntQueueClass;
637637

638638
TVDiskContextPtr VCtx;
639+
TPDiskErrorState PDiskErrorState;
639640
TIntrusivePtr<TVDiskConfig> Config;
640641
TIntrusivePtr<TBlobStorageGroupInfo> GInfo;
641642
std::shared_ptr<TBlobStorageGroupInfo::TTopology> Top;
@@ -934,7 +935,7 @@ namespace NKikimr {
934935
TABLED() {str << "Error Details";}
935936
TABLED() {
936937
str << "PDisk reported error: "
937-
<< TPDiskErrorState::StateToString(VCtx->GetPDiskErrorState());
938+
<< PDiskErrorState.ToString();
938939
}
939940
}
940941
} else if (VCtx->LocalRecoveryErrorStr) {
@@ -1663,12 +1664,15 @@ namespace NKikimr {
16631664
}
16641665

16651666
void Handle(TEvPDiskErrorStateChange::TPtr &ev, const TActorContext &ctx) {
1667+
auto errorStateChange = ev->Get();
1668+
1669+
PDiskErrorState.Set(errorStateChange->Status, errorStateChange->PDiskFlags, errorStateChange->ErrorReason);
1670+
16661671
LOG_ERROR_S(ctx, NKikimrServices::BS_SKELETON, VCtx->VDiskLogPrefix
16671672
<< "SkeletonFront: got TEvPDiskErrorStateChange;"
1668-
<< " state# " << TPDiskErrorState::StateToString(ev->Get()->State)
1673+
<< PDiskErrorState.ToString()
16691674
<< " Marker# BSVSF03");
16701675

1671-
16721676
// switch skeleton state to PDiskError
16731677
SkeletonFrontGroup->ResetCounters();
16741678
VDiskMonGroup.VDiskState(NKikimrWhiteboard::EVDiskState::PDiskError);

0 commit comments

Comments
 (0)