Skip to content

do not trigger dead tablet issue during creation of a lot of tablets… #10398

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Oct 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 28 additions & 13 deletions ydb/core/health_check/health_check.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,21 +189,34 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
int Count = 1;
TStackVec<TString> Identifiers;

TNodeTabletStateCount(const NKikimrHive::TTabletInfo& info, const TTabletStateSettings& settings) {
Type = info.tablettype();
Leader = info.followerid() == 0;
static ETabletState GetState(const NKikimrHive::TTabletInfo& info, const TTabletStateSettings& settings) {
if (info.volatilestate() == NKikimrHive::TABLET_VOLATILE_STATE_STOPPED) {
State = ETabletState::Stopped;
} else if (!settings.IsHiveSynchronizationPeriod
&& info.volatilestate() != NKikimrHive::TABLET_VOLATILE_STATE_RUNNING
&& TInstant::MilliSeconds(info.lastalivetimestamp()) < settings.AliveBarrier
&& info.tabletbootmode() == NKikimrHive::TABLET_BOOT_MODE_DEFAULT) {
State = ETabletState::Dead;
} else if (info.restartsperperiod() >= settings.MaxRestartsPerPeriod) {
State = ETabletState::RestartsTooOften;
} else {
State = ETabletState::Good;
return ETabletState::Stopped;
}
ETabletState state = (info.restartsperperiod() >= settings.MaxRestartsPerPeriod) ? ETabletState::RestartsTooOften : ETabletState::Good;
if (info.volatilestate() == NKikimrHive::TABLET_VOLATILE_STATE_RUNNING) {
return state;
}
if (info.tabletbootmode() != NKikimrHive::TABLET_BOOT_MODE_DEFAULT) {
return state;
}
if (info.lastalivetimestamp() != 0 && TInstant::MilliSeconds(info.lastalivetimestamp()) < settings.AliveBarrier) {
// Tablet is not alive for a long time
// We should report it as dead unless it's just waiting to be created
if (info.generation() == 0 && info.volatilestate() == NKikimrHive::TABLET_VOLATILE_STATE_BOOTING && !info.inwaitqueue()) {
return state;
}
return ETabletState::Dead;
}
return state;

}

TNodeTabletStateCount(const NKikimrHive::TTabletInfo& info, const TTabletStateSettings& settings)
: Type(info.tablettype())
, State(GetState(info, settings))
, Leader(info.followerid() == 0)
{
}

bool operator ==(const TNodeTabletStateCount& o) const {
Expand Down Expand Up @@ -1983,6 +1996,8 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
}
}

// do not propagate RED status to vdisk - so that vdisk is not considered down when computing group status
context.OverallStatus = MinStatus(context.OverallStatus, Ydb::Monitoring::StatusFlag::ORANGE);
storagePDiskStatus.set_overall(context.GetOverallStatus());
}

Expand Down
209 changes: 115 additions & 94 deletions ydb/core/health_check/health_check_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <ydb/core/testlib/test_client.h>
#include <ydb/public/lib/deprecated/kicli/kicli.h>

#include <ydb/core/mind/hive/hive_events.h>
#include <ydb/core/node_whiteboard/node_whiteboard.h>
#include <ydb/core/blobstorage/base/blobstorage_events.h>
#include <ydb/core/tx/schemeshard/schemeshard.h>
Expand Down Expand Up @@ -68,7 +69,8 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {

struct TTestVSlotInfo {
std::optional<NKikimrBlobStorage::EVDiskStatus> Status;
ui32 Generation;
ui32 Generation = DEFAULT_GROUP_GENERATION;
NKikimrBlobStorage::EDriveStatus PDiskStatus = NKikimrBlobStorage::ACTIVE;

TTestVSlotInfo(std::optional<NKikimrBlobStorage::EVDiskStatus> status = NKikimrBlobStorage::READY,
ui32 generation = DEFAULT_GROUP_GENERATION)
Expand All @@ -77,7 +79,11 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
{
}

TTestVSlotInfo(NKikimrBlobStorage::EVDiskStatus status) : Status(status), Generation(DEFAULT_GROUP_GENERATION) {}
TTestVSlotInfo(NKikimrBlobStorage::EVDiskStatus status, NKikimrBlobStorage::EDriveStatus pDiskStatus = NKikimrBlobStorage::ACTIVE)
: Status(status)
, PDiskStatus(pDiskStatus)
{
}
};

using TVDisks = TVector<TTestVSlotInfo>;
Expand Down Expand Up @@ -222,18 +228,20 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
entry->mutable_info()->set_name(STORAGE_POOL_NAME);
}

void AddPDisksToSysViewResponse(NSysView::TEvSysView::TEvGetPDisksResponse::TPtr* ev, size_t count, double occupancy) {
void AddPDisksToSysViewResponse(NSysView::TEvSysView::TEvGetPDisksResponse::TPtr* ev, const TVDisks& vslots, double occupancy) {
auto& record = (*ev)->Get()->Record;
auto entrySample = record.entries(0);
record.clear_entries();
auto pdiskId = PDISK_START_ID;
const size_t totalSize = 3'200'000'000'000ull;
for (size_t i = 0; i < count; ++i) {
const auto *descriptor = NKikimrBlobStorage::EDriveStatus_descriptor();
for (const auto& vslot : vslots) {
auto* entry = record.add_entries();
entry->CopyFrom(entrySample);
entry->mutable_key()->set_pdiskid(pdiskId);
entry->mutable_info()->set_totalsize(totalSize);
entry->mutable_info()->set_availablesize((1 - occupancy) * totalSize);
entry->mutable_info()->set_statusv2(descriptor->FindValueByNumber(vslot.PDiskStatus)->name());
++pdiskId;
}
}
Expand Down Expand Up @@ -482,7 +490,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
}
case NSysView::TEvSysView::EvGetPDisksResponse: {
auto* x = reinterpret_cast<NSysView::TEvSysView::TEvGetPDisksResponse::TPtr*>(&ev);
AddPDisksToSysViewResponse(x, vdisks.size(), occupancy);
AddPDisksToSysViewResponse(x, vdisks, occupancy);
break;
}
case NSysView::TEvSysView::EvGetGroupsResponse: {
Expand Down Expand Up @@ -710,6 +718,14 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 1);
}

Y_UNIT_TEST(YellowIssueReadyVDisksOnFaultyPDisks) {
auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::PARTIAL, TVDisks{3, {NKikimrBlobStorage::READY, NKikimrBlobStorage::FAULTY}});
Cerr << result.ShortDebugString() << Endl;
CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1);
CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::ORANGE, 0);
CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 0);
}

/* HC currently infers group status on its own, so it's never unknown
Y_UNIT_TEST(RedGroupIssueWhenUnknownGroupStatus) {
auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::UNKNOWN, {});
Expand Down Expand Up @@ -1818,123 +1834,128 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
UNIT_ASSERT_VALUES_EQUAL(database_status.storage().pools()[0].id(), "static");
}

void HiveSyncTest(bool syncPeriod) {
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Y_UNIT_TEST(ShardsLimit999) {
ShardsQuotaTest(999, 1000, 1, Ydb::Monitoring::StatusFlag::RED);
}

Y_UNIT_TEST(ShardsLimit995) {
ShardsQuotaTest(995, 1000, 1, Ydb::Monitoring::StatusFlag::ORANGE);
}

Y_UNIT_TEST(ShardsLimit905) {
ShardsQuotaTest(905, 1000, 1, Ydb::Monitoring::StatusFlag::YELLOW);
}

Y_UNIT_TEST(ShardsLimit800) {
ShardsQuotaTest(805, 1000, 0, Ydb::Monitoring::StatusFlag::GREEN);
}

Y_UNIT_TEST(ShardsNoLimit) {
ShardsQuotaTest(105, 0, 0, Ydb::Monitoring::StatusFlag::GREEN);
}

bool HasDeadTabletIssue(const Ydb::Monitoring::SelfCheckResult& result) {
for (const auto& issue_log : result.issue_log()) {
if (issue_log.level() == 4 && issue_log.type() == "TABLET") {
return true;
}
}
return false;
}

Y_UNIT_TEST(TestTabletIsDead) {
TPortManager tp;
ui16 port = tp.GetPort(2134);
ui16 grpcPort = tp.GetPort(2135);
auto settings = TServerSettings(port)
.SetNodeCount(1)
.SetNodeCount(2)
.SetDynamicNodeCount(1)
.SetUseRealThreads(false)
.SetDomainName("Root");
TServer server(settings);
server.EnableGRpc(grpcPort);

TClient client(settings);
TTestActorRuntime& runtime = *server.GetRuntime();

ui32 dynNodeId = runtime.GetNodeId(1);
TTestActorRuntime* runtime = server.GetRuntime();
TActorId sender = runtime->AllocateEdgeActor();

auto observerFunc = [&](TAutoPtr<IEventHandle>& ev) {
switch (ev->GetTypeRewrite()) {
case TEvHive::EvResponseHiveInfo: {
auto *x = reinterpret_cast<TEvHive::TEvResponseHiveInfo::TPtr*>(&ev);
auto& record = (*x)->Get()->Record;
record.SetStartTimeTimestamp(0);
if (syncPeriod) {
record.SetResponseTimestamp(NHealthCheck::TSelfCheckRequest::HIVE_SYNCHRONIZATION_PERIOD_MS / 2);
} else {
record.SetResponseTimestamp(NHealthCheck::TSelfCheckRequest::HIVE_SYNCHRONIZATION_PERIOD_MS * 2);
}
auto *tablet = record.MutableTablets()->Add();
tablet->SetTabletID(1);
tablet->SetNodeID(dynNodeId);
tablet->SetTabletType(NKikimrTabletBase::TTabletTypes::DataShard);
tablet->SetVolatileState(NKikimrHive::TABLET_VOLATILE_STATE_BOOTING);
tablet->MutableObjectDomain()->SetSchemeShard(SUBDOMAIN_KEY.OwnerId);
tablet->MutableObjectDomain()->SetPathId(SUBDOMAIN_KEY.LocalPathId);
break;
}
case TEvHive::EvResponseHiveNodeStats: {
auto *x = reinterpret_cast<TEvHive::TEvResponseHiveNodeStats::TPtr*>(&ev);
auto &record = (*x)->Get()->Record;
auto *nodeStats = record.MutableNodeStats()->Add();
nodeStats->SetNodeId(dynNodeId);
nodeStats->MutableNodeDomain()->SetSchemeShard(SUBDOMAIN_KEY.OwnerId);
nodeStats->MutableNodeDomain()->SetPathId(SUBDOMAIN_KEY.LocalPathId);
break;
}
case NConsole::TEvConsole::EvGetTenantStatusResponse: {
auto *x = reinterpret_cast<NConsole::TEvConsole::TEvGetTenantStatusResponse::TPtr*>(&ev);
ChangeGetTenantStatusResponse(x, "/Root/database");
break;
}
case TEvTxProxySchemeCache::EvNavigateKeySetResult: {
auto *x = reinterpret_cast<TEvTxProxySchemeCache::TEvNavigateKeySetResult::TPtr*>(&ev);
TSchemeCacheNavigate::TEntry& entry((*x)->Get()->Request->ResultSet.front());
entry.Status = TSchemeCacheNavigate::EStatus::Ok;
entry.Kind = TSchemeCacheNavigate::EKind::KindExtSubdomain;
entry.Path = {"Root", "database"};
entry.DomainInfo = MakeIntrusive<TDomainInfo>(SUBDOMAIN_KEY, SUBDOMAIN_KEY);
server.SetupDynamicLocalService(2, "Root");
server.StartPQTablets(1);
server.DestroyDynamicLocalService(2);
runtime->AdvanceCurrentTime(TDuration::Minutes(5));

break;
}
}
TAutoPtr<IEventHandle> handle;
runtime->Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0));
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
Cerr << result.ShortDebugString();

return TTestActorRuntime::EEventAction::PROCESS;
};
runtime.SetObserverFunc(observerFunc);
UNIT_ASSERT(HasDeadTabletIssue(result));
}

TActorId sender = runtime.AllocateEdgeActor();
TAutoPtr<IEventHandle> handle;
Y_UNIT_TEST(TestBootingTabletIsNotDead) {
TPortManager tp;
ui16 port = tp.GetPort(2134);
ui16 grpcPort = tp.GetPort(2135);
auto settings = TServerSettings(port)
.SetNodeCount(2)
.SetDynamicNodeCount(1)
.SetUseRealThreads(false)
.SetDomainName("Root");
TServer server(settings);
server.EnableGRpc(grpcPort);

auto *request = new NHealthCheck::TEvSelfCheckRequest;
request->Request.set_return_verbose_status(true);
request->Database = "/Root/database";
runtime.Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, request, 0));
const auto result = runtime.GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
TClient client(settings);

Cerr << result.ShortDebugString() << Endl;
TTestActorRuntime* runtime = server.GetRuntime();
TActorId sender = runtime->AllocateEdgeActor();

UNIT_ASSERT_VALUES_EQUAL(result.database_status_size(), 1);
auto blockBoot = runtime->AddObserver<NHive::TEvPrivate::TEvProcessBootQueue>([](auto&& ev) { ev.Reset(); });

bool deadTabletIssueFoundInResult = false;
for (const auto &issue_log : result.issue_log()) {
if (issue_log.level() == 4 && issue_log.type() == "TABLET") {
UNIT_ASSERT_VALUES_EQUAL(issue_log.location().compute().tablet().id().size(), 1);
UNIT_ASSERT_VALUES_EQUAL(issue_log.location().compute().tablet().type(), "DataShard");
deadTabletIssueFoundInResult = true;
}
}
server.SetupDynamicLocalService(2, "Root");
server.StartPQTablets(1, false);
runtime->AdvanceCurrentTime(TDuration::Minutes(5));

UNIT_ASSERT_VALUES_EQUAL(syncPeriod, !deadTabletIssueFoundInResult);
}
TAutoPtr<IEventHandle> handle;
runtime->Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0));
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
Cerr << result.ShortDebugString();

Y_UNIT_TEST(HiveSyncPeriodIgnoresTabletsState) {
HiveSyncTest(true);
UNIT_ASSERT(!HasDeadTabletIssue(result));
}

Y_UNIT_TEST(AfterHiveSyncPeriodReportsTabletsState) {
HiveSyncTest(false);
}
Y_UNIT_TEST(TestReBootingTabletIsDead) {
TPortManager tp;
ui16 port = tp.GetPort(2134);
ui16 grpcPort = tp.GetPort(2135);
auto settings = TServerSettings(port)
.SetNodeCount(2)
.SetDynamicNodeCount(2)
.SetUseRealThreads(false)
.SetDomainName("Root");
TServer server(settings);
server.EnableGRpc(grpcPort);

Y_UNIT_TEST(ShardsLimit999) {
ShardsQuotaTest(999, 1000, 1, Ydb::Monitoring::StatusFlag::RED);
}
TClient client(settings);

Y_UNIT_TEST(ShardsLimit995) {
ShardsQuotaTest(995, 1000, 1, Ydb::Monitoring::StatusFlag::ORANGE);
}
TTestActorRuntime* runtime = server.GetRuntime();
runtime->SetLogPriority(NKikimrServices::HIVE, NActors::NLog::PRI_TRACE);
TActorId sender = runtime->AllocateEdgeActor();

Y_UNIT_TEST(ShardsLimit905) {
ShardsQuotaTest(905, 1000, 1, Ydb::Monitoring::StatusFlag::YELLOW);
}

Y_UNIT_TEST(ShardsLimit800) {
ShardsQuotaTest(805, 1000, 0, Ydb::Monitoring::StatusFlag::GREEN);
}
server.SetupDynamicLocalService(2, "Root");
server.StartPQTablets(1, true);
server.SetupDynamicLocalService(3, "Root");
auto blockBoot = runtime->AddObserver<NHive::TEvPrivate::TEvProcessBootQueue>([](auto&& ev) { ev.Reset(); });
server.DestroyDynamicLocalService(2);
runtime->AdvanceCurrentTime(TDuration::Minutes(5));

Y_UNIT_TEST(ShardsNoLimit) {
ShardsQuotaTest(105, 0, 0, Ydb::Monitoring::StatusFlag::GREEN);
TAutoPtr<IEventHandle> handle;
runtime->Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0));
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
Cerr << result.ShortDebugString();

UNIT_ASSERT(HasDeadTabletIssue(result));
}
}
}
5 changes: 5 additions & 0 deletions ydb/core/mind/hive/hive_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ void THive::ExecuteProcessBootQueue(NIceDb::TNiceDb& db, TSideEffects& sideEffec
if (tablet == nullptr) {
continue;
}
tablet->InWaitQueue = false;
if (tablet->IsAlive()) {
BLOG_D("tablet " << record.TabletId << " already alive, skipping");
continue;
Expand All @@ -258,6 +259,7 @@ void THive::ExecuteProcessBootQueue(NIceDb::TNiceDb& db, TSideEffects& sideEffec
UpdateTabletFollowersNumber(leader, db, sideEffects);
}
BootQueue.AddToWaitQueue(record); // waiting for new node
tablet->InWaitQueue = true;
continue;
}
}
Expand Down Expand Up @@ -1850,6 +1852,9 @@ void THive::FillTabletInfo(NKikimrHive::TEvResponseHiveInfo& response, ui64 tabl
if (req.GetReturnMetrics()) {
tabletInfo.MutableMetrics()->CopyFrom(info->GetResourceValues());
}
if (info->InWaitQueue) {
tabletInfo.SetInWaitQueue(true);
}
if (req.GetReturnChannelHistory()) {
for (const auto& channel : info->TabletStorageInfo->Channels) {
auto& tabletChannel = *tabletInfo.AddTabletChannels();
Expand Down
1 change: 1 addition & 0 deletions ydb/core/mind/hive/tablet_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ struct TTabletInfo {
TInstant PostponedStart;
EBalancerPolicy BalancerPolicy;
TNodeId FailedNodeId = 0; // last time we tried to start the tablet, we failed on this node
bool InWaitQueue = false;

TTabletInfo(ETabletRole role, THive& hive);
TTabletInfo(const TTabletInfo&) = delete;
Expand Down
1 change: 1 addition & 0 deletions ydb/core/protos/hive.proto
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,7 @@ message TTabletInfo {
optional uint32 RestartsPerPeriod = 22;
optional uint64 LastAliveTimestamp = 23;
optional EBalancerPolicy BalancerPolicy = 24;
optional bool InWaitQueue = 25;
}

message TEvSeizeTabletsReply {
Expand Down
Loading
Loading