Skip to content

Commit bba21cb

Browse files
authored
Support processing unhandled exceptions in actors and tablets (#15468)
1 parent 1b4e25c commit bba21cb

File tree

14 files changed

+249
-21
lines changed

14 files changed

+249
-21
lines changed

ydb/core/driver_lib/run/main.cpp

+4
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,10 @@ std::terminate_handler defaultTerminateHandler;
178178
void KikimrTerminateHandler() {
179179
Cerr << "======= terminate() call stack ========\n";
180180
FormatBackTrace(&Cerr);
181+
if (auto backtrace = TBackTrace::FromCurrentException(); backtrace.size() > 0) {
182+
Cerr << "======== exception call stack =========\n";
183+
backtrace.PrintTo(Cerr);
184+
}
181185
Cerr << "=======================================\n";
182186

183187
auto oldHandler = defaultTerminateHandler;

ydb/core/protos/feature_flags.proto

+1
Original file line numberDiff line numberDiff line change
@@ -199,4 +199,5 @@ message TFeatureFlags {
199199
optional bool EnableShowCreate = 173 [default = false];
200200
optional bool EnableChangefeedsExport = 174 [default = false];
201201
optional bool EnableKafkaNativeBalancing = 175 [default = false];
202+
optional bool EnableTabletRestartOnUnhandledExceptions = 176 [default = true];
202203
}

ydb/core/tablet_flat/flat_executor.cpp

+18-1
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,20 @@ TExecutor::~TExecutor() {
139139

140140
}
141141

142+
bool TExecutor::OnUnhandledException(const std::exception& e) {
143+
if (AppData()->FeatureFlags.GetEnableTabletRestartOnUnhandledExceptions()) {
144+
if (auto log = Logger->Log(ELnLev::Crit)) {
145+
log << "Tablet " << TabletId() << " unhandled exception " << TypeName(e) << ": " << e.what()
146+
<< '\n' << TBackTrace::FromCurrentException().PrintToString();
147+
}
148+
Broken();
149+
return true;
150+
}
151+
152+
// Exception will propagate and cause the process to crash
153+
return false;
154+
}
155+
142156
ui64 TExecutor::Stamp() const noexcept
143157
{
144158
return CommitManager ? CommitManager->Stamp() : TTxStamp{ Generation0, Step0 }.Raw;
@@ -168,6 +182,7 @@ void TExecutor::Registered(TActorSystem *sys, const TActorId&)
168182
GetServiceCounters(AppData()->Counters, "tablets")->GetCounter("alerts_req_nodata", true);
169183
GetServiceCounters(AppData()->Counters, "tablets")->GetCounter("alerts_scan_nodata", true);
170184
GetServiceCounters(AppData()->Counters, "tablets")->GetCounter("alerts_boot_nodata", true);
185+
GetServiceCounters(AppData()->Counters, "tablets")->GetCounter("alerts_broken", true);
171186
}
172187

173188
void TExecutor::PassAway() {
@@ -195,6 +210,8 @@ void TExecutor::PassAway() {
195210
}
196211

197212
void TExecutor::Broken() {
213+
GetServiceCounters(AppData()->Counters, "tablets")->GetCounter("alerts_broken", true)->Inc();
214+
198215
if (BootLogic)
199216
BootLogic->Cancel();
200217

@@ -888,7 +905,7 @@ void TExecutor::Restored(TEvTablet::TEvRestored::TPtr &ev, const TActorContext &
888905
return TranscriptBootOpResult(res, ctx);
889906
}
890907

891-
void TExecutor::DetachTablet(const TActorContext &) {
908+
void TExecutor::DetachTablet() {
892909
TabletCountersForgetTablet(Owner->TabletID(), Owner->TabletType(),
893910
Owner->Info()->TenantPathId, Stats->IsFollower(), SelfId());
894911
return PassAway();

ydb/core/tablet_flat/flat_executor.h

+4-1
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,7 @@ struct TExecutorCaches {
312312
class TExecutor
313313
: public TActor<TExecutor>
314314
, public NFlatExecutorSetup::IExecutor
315+
, public IActorExceptionHandler
315316
, private NTable::ICompactionBackend
316317
, private ILoadBlob
317318
{
@@ -633,7 +634,7 @@ class TExecutor
633634
// IExecutor interface
634635
void Boot(TEvTablet::TEvBoot::TPtr &ev, const TActorContext &ctx) override;
635636
void Restored(TEvTablet::TEvRestored::TPtr &ev, const TActorContext &ctx) override;
636-
void DetachTablet(const TActorContext &ctx) override;
637+
void DetachTablet() override;
637638
ui64 DoExecute(TAutoPtr<ITransaction> transaction, ETxMode mode);
638639
void Execute(TAutoPtr<ITransaction> transaction, const TActorContext &ctx) override;
639640
ui64 Enqueue(TAutoPtr<ITransaction> transaction) override;
@@ -705,6 +706,8 @@ class TExecutor
705706
TExecutor(NFlatExecutorSetup::ITablet *owner, const TActorId& ownerActorId);
706707
~TExecutor();
707708

709+
bool OnUnhandledException(const std::exception&) override;
710+
708711
STFUNC(StateInit);
709712
STFUNC(StateBoot);
710713
STFUNC(StateWork);

ydb/core/tablet_flat/flat_executor_ut.cpp

+37-1
Original file line numberDiff line numberDiff line change
@@ -523,7 +523,7 @@ class TTestFlatTablet : public TActor<TTestFlatTablet>, public TTabletExecutedFl
523523

524524
void Handle(TEvents::TEvPoison::TPtr &, const TActorContext &ctx) {
525525
Become(&TThis::StateBroken);
526-
Executor()->DetachTablet(ctx), Detach(ctx); /* see TDummy tablet */
526+
Executor()->DetachTablet(), Detach(ctx); /* see TDummy tablet */
527527
ctx.Send(Sender, new TEvents::TEvGone);
528528
}
529529

@@ -7354,5 +7354,41 @@ Y_UNIT_TEST_SUITE(TFlatTableExecutor_LowPriorityTxs) {
73547354
}
73557355
}
73567356

7357+
Y_UNIT_TEST_SUITE(TFlatTableExecutor_Exceptions) {
    // Transaction whose Execute() unconditionally throws std::runtime_error("test").
    struct TTxExecuteThrowException : public ITransaction {
        bool Execute(TTransactionContext&, const TActorContext&) override {
            throw std::runtime_error("test");
        }

        void Complete(const TActorContext&) override {
            // not reached
        }
    };

    // Submits the throwing transaction via Execute(tx, ctx) with the
    // restart-on-unhandled-exception flag switched on, then waits for the
    // environment's "gone" notification.
    Y_UNIT_TEST(TestTabletExecuteExceptionDirect) {
        TMyEnvBase env;
        env->GetAppData().FeatureFlags.SetEnableTabletRestartOnUnhandledExceptions(true);

        env.FireDummyTablet();

        env.SendAsync(new NFake::TEvExecute([&](auto* tablet, const auto& actorCtx) {
            tablet->Execute(new TTxExecuteThrowException, actorCtx);
        }));
        env.WaitForGone();
    }

    // Same scenario as above, but the throwing transaction is submitted
    // through Enqueue(tx) instead of Execute(tx, ctx).
    Y_UNIT_TEST(TestTabletExecuteExceptionEnqueue) {
        TMyEnvBase env;
        env->GetAppData().FeatureFlags.SetEnableTabletRestartOnUnhandledExceptions(true);

        env.FireDummyTablet();

        env.SendAsync(new NFake::TEvExecute([&](auto* tablet, const auto&) {
            tablet->Enqueue(new TTxExecuteThrowException);
        }));
        env.WaitForGone();
    }
}
7392+
73577393
}
73587394
}

ydb/core/tablet_flat/tablet_flat_executed.cpp

+26-3
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include "flat_executor.h"
33
#include "flat_executor_counters.h"
44
#include <ydb/core/base/appdata.h>
5+
#include <ydb/core/base/counters.h>
56
#include <library/cpp/monlib/service/pages/templates.h>
67

78
namespace NKikimr {
@@ -16,6 +17,28 @@ TTabletExecutedFlat::TTabletExecutedFlat(TTabletStorageInfo *info, const TActorI
1617
, StartTime0(TAppData::TimeProvider->Now())
1718
{}
1819

20+
bool TTabletExecutedFlat::OnUnhandledException(const std::exception& e) {
21+
if (AppData()->FeatureFlags.GetEnableTabletRestartOnUnhandledExceptions()) {
22+
// Tablets have a weird inheritence where subclass is always an actor,
23+
// but we don't know the exact type at compile time. This dynamic_cast
24+
// is expected to always succeed.
25+
if (auto* actor = dynamic_cast<IActor*>(this)) {
26+
auto ctx = TActivationContext::ActorContextFor(actor->SelfId());
27+
LOG_CRIT_S(*TlsActivationContext, NKikimrServices::TABLET_EXECUTOR,
28+
"Tablet " << TabletID() << " unhandled exception " << TypeName(e) << ": " << e.what()
29+
<< '\n' << TBackTrace::FromCurrentException().PrintToString());
30+
31+
GetServiceCounters(AppData(ctx)->Counters, "tablets")->GetCounter("alerts_broken", true)->Inc();
32+
33+
HandlePoison(ctx);
34+
return true;
35+
}
36+
}
37+
38+
// Exception will propagate and cause the process to crash
39+
return false;
40+
}
41+
1942
IExecutor* TTabletExecutedFlat::CreateExecutor(const TActorContext &ctx) {
2043
if (!Executor()) {
2144
IActor *executor = NFlatExecutorSetup::CreateExecutor(this, ctx.SelfID);
@@ -123,9 +146,9 @@ void TTabletExecutedFlat::OnTabletStop(TEvTablet::TEvTabletStop::TPtr &ev, const
123146
ctx.Send(Tablet(), new TEvTablet::TEvTabletStopped());
124147
}
125148

126-
void TTabletExecutedFlat::HandlePoison(const TActorContext &ctx) {
149+
void TTabletExecutedFlat::HandlePoison(const TActorContext& ctx) {
127150
if (Executor0) {
128-
Executor0->DetachTablet(ExecutorCtx(ctx));
151+
Executor0->DetachTablet();
129152
Executor0 = nullptr;
130153
}
131154

@@ -142,7 +165,7 @@ void TTabletExecutedFlat::HandleTabletStop(TEvTablet::TEvTabletStop::TPtr &ev, c
142165

143166
void TTabletExecutedFlat::HandleTabletDead(TEvTablet::TEvTabletDead::TPtr &ev, const TActorContext &ctx) {
144167
if (Executor0) {
145-
Executor0->DetachTablet(ExecutorCtx(ctx));
168+
Executor0->DetachTablet();
146169
Executor0 = nullptr;
147170
}
148171

ydb/core/tablet_flat/tablet_flat_executed.h

+6-1
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,19 @@ struct IMiniKQLFactory {
1616
virtual TAutoPtr<ITransaction> Make(TEvTablet::TEvLocalReadColumns::TPtr&) = 0;
1717
};
1818

19-
class TTabletExecutedFlat : public NFlatExecutorSetup::ITablet {
19+
class TTabletExecutedFlat
20+
: public NFlatExecutorSetup::ITablet
21+
, public IActorExceptionHandler
22+
{
2023
protected:
2124
using IExecutor = NFlatExecutorSetup::IExecutor;
2225

2326
TTabletExecutedFlat(TTabletStorageInfo *info, const TActorId &tablet, IMiniKQLFactory *factory);
2427
IExecutor* Executor() const { return Executor0; }
2528
const TInstant StartTime() const { return StartTime0; }
2629

30+
bool OnUnhandledException(const std::exception&) override;
31+
2732
void Execute(TAutoPtr<ITransaction> transaction, const TActorContext &ctx);
2833
void Execute(TAutoPtr<ITransaction> transaction);
2934
ui64 Enqueue(TAutoPtr<ITransaction> transaction);

ydb/core/tablet_flat/tablet_flat_executor.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -550,7 +550,7 @@ namespace NFlatExecutorSetup {
550550
// tablet generation restoration complete, tablet could act as leader
551551
virtual void Restored(TEvTablet::TEvRestored::TPtr &ev, const TActorContext &ctx) = 0;
552552
// die!
553-
virtual void DetachTablet(const TActorContext &ctx) = 0;
553+
virtual void DetachTablet() = 0;
554554

555555
// tablet assigned as follower (or follower connection refreshed), must begin loading
556556
virtual void FollowerBoot(TEvTablet::TEvFBoot::TPtr &ev, const TActorContext &ctx) = 0;

ydb/core/tablet_flat/test/libs/exec/dummy.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ namespace NFake {
1515
virtual NFake::TEvExecute* OnFinished() = 0;
1616
};
1717

18-
class TDummy : public ::NActors::IActorCallback, public TExecuted {
18+
class TDummy : public TActor<TDummy>, public TExecuted {
1919
enum EState {
2020
Boot = 1,
2121
Work = 2,
@@ -35,7 +35,7 @@ namespace NFake {
3535

3636
TDummy(const TActorId &tablet, TInfo *info, const TActorId& owner,
3737
ui32 flags = 0 /* ORed EFlg enum */)
38-
: ::NActors::IActorCallback(static_cast<TReceiveFunc>(&TDummy::Inbox), NKikimrServices::TActivity::FAKE_ENV_A)
38+
: TActor(&TDummy::Inbox, NKikimrServices::TActivity::FAKE_ENV_A)
3939
, TTabletExecutedFlat(info, tablet, nullptr)
4040
, Owner(owner)
4141
, Flags(flags)
@@ -75,7 +75,7 @@ namespace NFake {
7575
*/
7676

7777
auto ctx(this->ActorContext());
78-
Executor()->DetachTablet(ctx), Detach(ctx);
78+
Executor()->DetachTablet(), Detach(ctx);
7979
}
8080
} else if (State == EState::Boot) {
8181
TTabletExecutedFlat::StateInitImpl(eh, SelfId());

ydb/core/testlib/actors/test_runtime.cpp

+5
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,11 @@ namespace NActors {
123123
NKikimr::TAppData::TimeProvider = TimeProvider;
124124
}
125125

126+
// We want tests to fail on unhandled exceptions by default
127+
if (!App0->FeatureFlags.HasEnableTabletRestartOnUnhandledExceptions()) {
128+
App0->FeatureFlags.SetEnableTabletRestartOnUnhandledExceptions(false);
129+
}
130+
126131
MonPorts.clear();
127132
for (ui32 nodeIndex = 0; nodeIndex < NodeCount; ++nodeIndex) {
128133
ui32 nodeId = FirstNodeId + nodeIndex;

ydb/library/actors/core/actor.cpp

+20
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,26 @@ namespace NActors {
267267
return NHPTimer::GetSeconds(ElapsedTicks);
268268
}
269269

270+
/**
 * Delivers an event to the actor's current state function.
 *
 * Bumps HandledEvents and refreshes LastReceiveTimestamp before dispatching.
 * In builds without NDEBUG, events carrying FlagDebugTrackReceive trigger
 * YaDebugBreak() first.
 *
 * An std::exception thrown by the state function is offered to
 * IActorExceptionHandler::OnUnhandledException() when this object implements
 * that interface; it is rethrown unless the handler returns true.
 */
void IActor::Receive(TAutoPtr<IEventHandle>& ev) {
#ifndef NDEBUG
    if (ev->Flags & IEventHandle::FlagDebugTrackReceive) {
        YaDebugBreak();
    }
#endif
    ++HandledEvents;
    LastReceiveTimestamp = TActivationContext::Monotonic();

    try {
        (this->*StateFunc_)(ev);
    } catch (const std::exception& e) {
        // The handler interface is optional, so probe for it at runtime.
        auto* handler = dynamic_cast<IActorExceptionHandler*>(this);
        const bool handled = handler != nullptr && handler->OnUnhandledException(e);
        if (!handled) {
            throw;
        }
    }
}
}
289+
270290
void IActor::Registered(TActorSystem* sys, const TActorId& owner) {
271291
// fallback to legacy method, do not use it anymore
272292
if (auto eh = AfterRegister(SelfId(), owner)) {

ydb/library/actors/core/actor.h

+18-10
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,23 @@ namespace NActors {
347347
void DoActorInit() { LastUsageTimestamp = GetCycleCountFast(); }
348348
};
349349

350+
/**
 * Opt-in interface for actors that want a chance to process exceptions
 * escaping their event handlers.
 */
class IActorExceptionHandler {
public:
    /**
     * Invoked when the actor's event handler throws an exception derived
     * from std::exception.
     *
     * Return true when the exception has been handled; return false to have
     * it rethrown (which will likely result in a process crash).
     */
    virtual bool OnUnhandledException(const std::exception&) = 0;

protected:
    // Protected and non-virtual: objects are not meant to be destroyed
    // through a pointer to this interface.
    ~IActorExceptionHandler() = default;
};
};
366+
350367
class IActor
351368
: protected IActorOps
352369
, public TActorUsageImpl<ActorLibCollectUsageStats>
@@ -547,16 +564,7 @@ namespace NActors {
547564
return SelfActorId;
548565
}
549566

550-
void Receive(TAutoPtr<IEventHandle>& ev) {
551-
#ifndef NDEBUG
552-
if (ev->Flags & IEventHandle::FlagDebugTrackReceive) {
553-
YaDebugBreak();
554-
}
555-
#endif
556-
++HandledEvents;
557-
LastReceiveTimestamp = TActivationContext::Monotonic();
558-
(this->*StateFunc_)(ev);
559-
}
567+
void Receive(TAutoPtr<IEventHandle>& ev);
560568

561569
TActorContext ActorContext() const {
562570
return TActivationContext::ActorContextFor(SelfId());

0 commit comments

Comments
 (0)