Skip to content

Add disable evict vdisks option to config #9812

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions ydb/core/cms/cms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,12 @@ bool TCms::CheckEvictVDisks(const TAction &action, TErrorInfo &error) const {
return false;
}

if (State->Config.SentinelConfig.EvictVDisksStatus.Empty()) {
error.Code = TStatus::ERROR;
error.Reason = "Evict vdisks is disabled in Sentinel (self heal)";
return false;
}

switch (action.GetType()) {
case TAction::RESTART_SERVICES:
case TAction::SHUTDOWN_HOST:
Expand Down
40 changes: 40 additions & 0 deletions ydb/core/cms/cms_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1853,6 +1853,46 @@ Y_UNIT_TEST_SUITE(TCmsTest) {
env.CheckDonePermission("user", permission2.GetPermissions(0).GetId());
}

Y_UNIT_TEST(DisabledEvictVDisks)
{
    // Scenario: an evict-VDisks request is pending, eviction is switched off
    // in the Sentinel config, and then switched back on.
    using TSentinelProto = NKikimrCms::TCmsConfig::TSentinelConfig;

    auto envOpts = TTestEnvOpts(8).WithSentinel();
    TCmsTestEnv env(envOpts);
    env.SetLogPriority(NKikimrServices::CMS, NLog::PRI_DEBUG);

    // Lower the default state limit so that transitions happen faster in the test
    auto config = env.GetCmsConfig();
    config.MutableSentinelConfig()->SetDefaultStateLimit(1);
    env.SetCmsConfig(config);

    // Step 1: ask for a restart of node 0 with VDisks eviction requested
    auto evictRequest = env.CheckPermissionRequest(
        MakePermissionRequest(TRequestOptions("user").WithEvictVDisks(),
            MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(0), 600000000, "storage")
        ),
        TStatus::DISALLOW_TEMP // ok, waiting for move VDisks
    );

    // The node is expected to be reported to BSC as FAULTY
    env.CheckBSCUpdateRequests({ env.GetNodeId(0) }, NKikimrBlobStorage::FAULTY);

    // Step 2: turn VDisks eviction off through the config
    config.MutableSentinelConfig()->SetEvictVDisksStatus(TSentinelProto::DISABLED);
    env.SetCmsConfig(config);

    // The node is expected to be reported to BSC as ACTIVE again
    env.CheckBSCUpdateRequests({ env.GetNodeId(0) }, NKikimrBlobStorage::ACTIVE);

    // While eviction is disabled the pending request must be answered with ERROR
    env.CheckRequest("user", evictRequest.GetRequestId(), false, TStatus::ERROR, 0);

    // Step 3: turn VDisks eviction back on
    config.MutableSentinelConfig()->SetEvictVDisksStatus(TSentinelProto::FAULTY);
    env.SetCmsConfig(config);

    // The FAULTY status must be sent to BSC once more
    env.CheckBSCUpdateRequests({ env.GetNodeId(0) }, NKikimrBlobStorage::FAULTY);
}

Y_UNIT_TEST(EmergencyDuringRollingRestart)
{
TCmsTestEnv env(8);
Expand Down
32 changes: 32 additions & 0 deletions ydb/core/cms/config.h
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
#pragma once

#include "pdisk_state.h"
#include "pdisk_status.h"

#include <ydb/core/protos/cms.pb.h>

#include <util/datetime/base.h>
#include <util/generic/hash.h>
#include <util/generic/map.h>
#include <util/generic/maybe.h>

namespace NKikimr::NCms {

Expand All @@ -30,6 +32,8 @@ struct TCmsSentinelConfig {
ui32 RoomRatio;
ui32 RackRatio;

TMaybeFail<EPDiskStatus> EvictVDisksStatus;

void Serialize(NKikimrCms::TCmsConfig::TSentinelConfig &config) const {
config.SetEnable(Enable);
config.SetDryRun(DryRun);
Expand All @@ -45,6 +49,7 @@ struct TCmsSentinelConfig {
config.SetRackRatio(RackRatio);

SaveStateLimits(config);
SaveEvictVDisksStatus(config);
}

void Deserialize(const NKikimrCms::TCmsConfig::TSentinelConfig &config) {
Expand All @@ -63,6 +68,8 @@ struct TCmsSentinelConfig {

auto newStateLimits = LoadStateLimits(config);
StateLimits.swap(newStateLimits);

EvictVDisksStatus = LoadEvictVDisksStatus(config);
}

void SaveStateLimits(NKikimrCms::TCmsConfig::TSentinelConfig &config) const {
Expand Down Expand Up @@ -129,6 +136,31 @@ struct TCmsSentinelConfig {

return stateLimits;
}

// Translates the protobuf EvictVDisksStatus setting into the in-memory form.
// Returns Nothing() when eviction is DISABLED; every other value
// (UNKNOWN and FAULTY included) yields EPDiskStatus::FAULTY.
static TMaybeFail<EPDiskStatus> LoadEvictVDisksStatus(const NKikimrCms::TCmsConfig::TSentinelConfig &config) {
    // The enum constants are reached through the enclosing message type.
    using TSentinelProto = NKikimrCms::TCmsConfig::TSentinelConfig;

    if (config.GetEvictVDisksStatus() == TSentinelProto::DISABLED) {
        return Nothing();
    }

    return EPDiskStatus::FAULTY;
}

// Writes EvictVDisksStatus into the protobuf config.
// An empty value is stored as DISABLED and EPDiskStatus::FAULTY as FAULTY;
// any other defined status leaves the protobuf field untouched.
void SaveEvictVDisksStatus(NKikimrCms::TCmsConfig::TSentinelConfig &config) const {
    // The enum constants are reached through the enclosing message type.
    using TSentinelProto = NKikimrCms::TCmsConfig::TSentinelConfig;

    if (EvictVDisksStatus.Defined()) {
        if (*EvictVDisksStatus == EPDiskStatus::FAULTY) {
            config.SetEvictVDisksStatus(TSentinelProto::FAULTY);
        }
    } else {
        config.SetEvictVDisksStatus(TSentinelProto::DISABLED);
    }
}
};

struct TCmsLogConfig {
Expand Down
4 changes: 2 additions & 2 deletions ydb/core/cms/sentinel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -895,8 +895,8 @@ class TSentinel: public TActorBootstrapped<TSentinel> {
continue;
}

if (it->second.HasFaultyMarker()) {
info.SetForcedStatus(EPDiskStatus::FAULTY);
if (it->second.HasFaultyMarker() && Config.EvictVDisksStatus.Defined()) {
info.SetForcedStatus(*Config.EvictVDisksStatus);
} else {
info.ResetForcedStatus();
}
Expand Down
7 changes: 7 additions & 0 deletions ydb/core/protos/cms.proto
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,12 @@ message TCmsConfig {
optional uint32 Limit = 2;
}

// Controls VDisks eviction: selects the PDisk status that Sentinel forces
// on PDisks carrying a faulty marker, or turns eviction off.
enum EEvictVDisksStatus {
// Handled the same way as FAULTY when the config is loaded.
UNKNOWN = 0;
// Eviction is off: no status is forced and CMS rejects evict-VDisks requests.
DISABLED = 1;
// Marked PDisks get the FAULTY status forced.
FAULTY = 2;
}

optional bool Enable = 1 [default = true];
// Updater's config
optional uint64 UpdateConfigInterval = 2 [default = 3600000000];
Expand All @@ -433,6 +439,7 @@ message TCmsConfig {

optional bool DryRun = 13;
repeated TStateLimit StateLimits = 14;
optional EEvictVDisksStatus EvictVDisksStatus = 15;
}

message TLogConfig {
Expand Down
Loading