Skip to content

Commit 4428e19

Browse files
committed
.
1 parent 356726c commit 4428e19

File tree

4 files changed

+35
-9
lines changed

4 files changed

+35
-9
lines changed

ya.conf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build_cache = true
44
build_cache_conf = ['cas_logging=true', 'graph_info=true']
55
build_cache_master = true
66
cache_codec = ''
7-
cache_size = 150374182400
7+
cache_size = 300748364800
88
content_uids = true
99
dir_outputs = true
1010
dir_outputs_test_mode = true

ydb/apps/dstool/lib/dstool_cmd_cluster_workload_run.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ def add_options(p):
1818
p.add_argument('--enable-pdisk-encryption-keys-changes', action='store_true', help='Enable changes of PDisk encryption keys')
1919
p.add_argument('--enable-kill-tablets', action='store_true', help='Enable tablet killer')
2020
p.add_argument('--enable-kill-blob-depot', action='store_true', help='Enable BlobDepot killer')
21+
p.add_argument('--enable-restart-pdisks', action='store_true', help='Enable PDisk restarter')
2122
p.add_argument('--kill-signal', type=str, default='KILL', help='Kill signal to send to restart node')
2223

2324

@@ -144,6 +145,19 @@ def do_restart(node_id):
144145
if args.enable_pdisk_encryption_keys_changes:
145146
remove_old_pdisk_keys(pdisk_keys, pdisk_key_versions, node_id)
146147

# do_restart_pdisk(node_id, pdisk_id): send a RestartPDisk command for a single PDisk
# through a TConfigRequest.
#
# - Asserts up front that can_act_on_vslot(node_id, pdisk_id) holds.
# - The request is created with IgnoreDegradedGroupsChecks=True and carries exactly one
#   command, whose TargetPDiskId is filled from node_id and pdisk_id.
# - Raises Exception('failed to perform restart request: ...') if
#   common.invoke_bsc_request raises, and Exception('Unexpected error from BSC: ...')
#   if the response has Success unset; otherwise returns None.
# NOTE(review): the re-raise wraps the original error as text only (no `from e`) — confirm
# that dropping the explicit cause is intended.
148+
def do_restart_pdisk(node_id, pdisk_id):
149+
assert can_act_on_vslot(node_id, pdisk_id)
150+
request = common.kikimr_bsconfig.TConfigRequest(IgnoreDegradedGroupsChecks=True)
151+
cmd = request.Command.add().RestartPDisk
152+
cmd.TargetPDiskId.NodeId = node_id
153+
cmd.TargetPDiskId.PDiskId = pdisk_id
154+
try:
155+
response = common.invoke_bsc_request(request)
156+
except Exception as e:
157+
raise Exception('failed to perform restart request: %s' % e)
158+
if not response.Success:
159+
raise Exception('Unexpected error from BSC: %s' % response.ErrorDescription)
160+
147161
def do_evict(vslot_id):
148162
assert can_act_on_vslot(*vslot_id)
149163
try:
@@ -230,13 +244,16 @@ def do_kill_blob_depot():
230244
wipes = []
231245
readonlies = []
232246
unreadonlies = []
247+
pdisk_restarts = []
233248

234249
for vslot in base_config.VSlot:
235250
if common.is_dynamic_group(vslot.GroupId):
236251
vslot_id = common.get_vslot_id(vslot.VSlotId)
237252
vdisk_id = '[%08x:%d:%d:%d]' % (vslot.GroupId, vslot.FailRealmIdx, vslot.FailDomainIdx, vslot.VDiskIdx)
238253
if vslot_id in vslot_readonly and not args.disable_readonly:
239254
unreadonlies.append(('un-readonly vslot id: %s, vdisk id: %s' % (vslot_id, vdisk_id), (do_readonly, vslot, False)))
255+
if can_act_on_vslot(*vslot_id[:2]) and args.enable_restart_pdisks:
256+
pdisk_restarts.append(('restart pdisk node_id: %d, pdisk_id: %d' % vslot_id[:2], (do_restart_pdisk, *vslot_id[:2])))
240257
if can_act_on_vslot(*vslot_id) and (recent_restarts or args.disable_restarts):
241258
if not args.disable_evicts:
242259
evicts.append(('evict vslot id: %s, vdisk id: %s' % (vslot_id, vdisk_id), (do_evict, vslot_id)))
@@ -258,6 +275,8 @@ def pick(v):
258275
possible_actions.append(('readonly', (pick, readonlies)))
259276
if unreadonlies:
260277
possible_actions.append(('un-readonly', (pick, unreadonlies)))
278+
if pdisk_restarts:
279+
possible_actions.append(('restart-pdisk', (pick, pdisk_restarts)))
261280

262281
restarts = []
263282

ydb/apps/dstool/lib/dstool_cmd_group_virtual_create.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def do(args):
6767
names_remaining.remove(group.VirtualGroupInfo.Name)
6868
elif group.VirtualGroupInfo.State == common.EVirtualGroupState.CREATE_FAILED:
6969
names_remaining.remove(group.VirtualGroupInfo.Name)
70-
errors.append(f'{group.VirtualGroupInfo.Name}: {group.ErrorReason}')
70+
errors.append(f'{group.VirtualGroupInfo.Name}: {group.VirtualGroupInfo.ErrorReason}')
7171

7272
if names_remaining:
7373
time.sleep(1)

ydb/core/blobstorage/nodewarden/node_warden_pdisk.cpp

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -273,23 +273,30 @@ namespace NKikimr::NStorage {
273273
void TNodeWarden::DoRestartLocalPDisk(const NKikimrBlobStorage::TNodeWardenServiceSet::TPDisk& pdisk) {
274274
ui32 pdiskId = pdisk.GetPDiskID();
275275

276-
const TActorId actorId = MakeBlobStoragePDiskID(LocalNodeId, pdiskId);
276+
STLOG(PRI_NOTICE, BS_NODE, NW75, "DoRestartLocalPDisk", (PDiskId, pdiskId));
277+
278+
const auto [_, inserted] = PDiskRestartInFlight.emplace(pdiskId);
279+
280+
if (!inserted) {
281+
STLOG(PRI_NOTICE, BS_NODE, NW76, "Restart already in progress", (PDiskId, pdiskId));
282+
// Restart is already in progress.
283+
return;
284+
}
277285

278286
auto it = LocalPDisks.find(TPDiskKey(LocalNodeId, pdiskId));
279287
if (it == LocalPDisks.end()) {
288+
PDiskRestartInFlight.erase(pdiskId);
289+
290+
STLOG(PRI_NOTICE, BS_NODE, NW77, "Restart state carried from previous start, just starting", (PDiskId, pdiskId));
291+
280292
// This can happen if warden didn't handle pdisk's restart before node's restart.
281293
// In this case, PDisk has EntityStatus::RESTART instead of EntityStatus::INITIAL.
282294
StartLocalPDisk(pdisk);
283295
SendPDiskReport(pdiskId, NKikimrBlobStorage::TEvControllerNodeReport::PD_RESTARTED);
284296
return;
285297
}
286298

287-
const auto [_, inserted] = PDiskRestartInFlight.emplace(pdiskId);
288-
289-
if (!inserted) {
290-
// Restart is already in progress.
291-
return;
292-
}
299+
const TActorId actorId = MakeBlobStoragePDiskID(LocalNodeId, pdiskId);
293300

294301
TIntrusivePtr<TPDiskConfig> pdiskConfig = CreatePDiskConfig(it->second.Record);
295302

0 commit comments

Comments
 (0)