Skip to content

Commit ebee36a

Browse files
authored
schemeshard: reject operations with too big local tx commit (ydb-platform#6760)
Add commit redo size check for successfully ignited operations as a precaution measure to avoid infinite loop of schemeshard hitting local tx commit redo size limit, restarting, attempting to propose persisted operation again, hitting commit redo size limit again, restarting and so on. This could happen with inherently massive operations such as copy-tables used as a starting step of database export/backup. Coping large number of tables with huge number of partitions can result in so large TTxOperationPropose local transaction that its size would hit the limit imposed by the tablet executor. Tablet violating that limit is considered broken and will be immediately stopped. See ydb/core/tablet_flat/flat_executor.cpp, NTabletFlatExecutor::TExecutor::ExecuteTransaction(). KIKIMR-21751
1 parent ab98fb1 commit ebee36a

File tree

5 files changed

+232
-16
lines changed

5 files changed

+232
-16
lines changed

ydb/core/tx/schemeshard/schemeshard__operation.cpp

+110-15
Original file line numberDiff line numberDiff line change
@@ -104,14 +104,12 @@ THolder<TProposeResponse> TSchemeShard::IgniteOperation(TProposeRequest& request
104104
}
105105

106106
TOperation::TPtr operation = new TOperation(txId);
107-
Operations[txId] = operation; //record is erased at ApplyOnExecute if all parts are done at propose
108107

109108
for (const auto& transaction : record.GetTransaction()) {
110109
auto quotaResult = operation->ConsumeQuota(transaction, context);
111110
if (quotaResult.Status != NKikimrScheme::StatusSuccess) {
112111
response.Reset(new TProposeResponse(quotaResult.Status, ui64(txId), ui64(selfId)));
113112
response->SetError(quotaResult.Status, quotaResult.Reason);
114-
Operations.erase(txId);
115113
return std::move(response);
116114
}
117115
}
@@ -131,7 +129,6 @@ THolder<TProposeResponse> TSchemeShard::IgniteOperation(TProposeRequest& request
131129
if (splitResult.Status != NKikimrScheme::StatusSuccess) {
132130
response.Reset(new TProposeResponse(splitResult.Status, ui64(txId), ui64(selfId)));
133131
response->SetError(splitResult.Status, splitResult.Reason);
134-
Operations.erase(txId);
135132
return std::move(response);
136133
}
137134

@@ -140,11 +137,15 @@ THolder<TProposeResponse> TSchemeShard::IgniteOperation(TProposeRequest& request
140137

141138
const TString owner = record.HasOwner() ? record.GetOwner() : BUILTIN_ACL_ROOT;
142139

140+
bool prevProposeUndoSafe = true;
141+
142+
Operations[txId] = operation; //record is erased at ApplyOnExecute if all parts are done at propose
143+
143144
for (const auto& transaction : transactions) {
144145
auto parts = operation->ConstructParts(transaction, context);
145146

146147
if (parts.size() > 1) {
147-
// les't allow altering impl index tables as part of consistent operation
148+
// allow altering impl index tables as part of consistent operation
148149
context.IsAllowedPrivateTables = true;
149150
}
150151

@@ -198,25 +199,77 @@ THolder<TProposeResponse> TSchemeShard::IgniteOperation(TProposeRequest& request
198199
<< ", with reason: " << response->Record.GetReason()
199200
<< ", tx message: " << SecureDebugString(record));
200201

201-
context.OnComplete = {}; // recreate
202-
context.DbChanges = {};
202+
AbortOperationPropose(txId, context);
203203

204-
for (auto& toAbort : operation->Parts) {
205-
toAbort->AbortPropose(context);
206-
}
204+
return std::move(response);
205+
}
207206

208-
context.MemChanges.UnDo(context.SS);
209-
context.OnComplete.ApplyOnExecute(context.SS, context.GetTxc(), context.Ctx);
210-
Operations.erase(txId);
207+
// Check suboperations for undo safety. Log first unsafe suboperation in the schema transaction.
208+
if (prevProposeUndoSafe && !context.IsUndoChangesSafe()) {
209+
prevProposeUndoSafe = false;
211210

212-
return std::move(response);
211+
LOG_WARN_S(context.Ctx, NKikimrServices::FLAT_TX_SCHEMESHARD,
212+
"Operation part proposed ok, but propose itself is undo unsafe"
213+
<< ", suboperation type: " << NKikimrSchemeOp::EOperationType_Name(part->GetTransaction().GetOperationType())
214+
<< ", opId: " << part->GetOperationId()
215+
<< ", at schemeshard: " << selfId
216+
);
213217
}
214218
}
215219
}
216220

217221
return std::move(response);
218222
}
219223

224+
void TSchemeShard::AbortOperationPropose(const TTxId txId, TOperationContext& context) {
225+
Y_ABORT_UNLESS(Operations.contains(txId));
226+
TOperation::TPtr operation = Operations.at(txId);
227+
228+
// Drop operation side effects, undo memory changes
229+
// (Local db changes were already applied)
230+
context.OnComplete = {};
231+
context.DbChanges = {};
232+
233+
for (auto& i : operation->Parts) {
234+
i->AbortPropose(context);
235+
}
236+
237+
context.MemChanges.UnDo(context.SS);
238+
239+
// And remove aborted operation from existence
240+
Operations.erase(txId);
241+
}
242+
243+
void AbortOperation(TOperationContext& context, const TTxId txId, const TString& reason) {
244+
LOG_ERROR_S(context.Ctx, NKikimrServices::FLAT_TX_SCHEMESHARD, "TTxOperationPropose Execute"
245+
<< ", txId: " << txId
246+
<< ", operation is rejected and all changes reverted"
247+
<< ", " << reason
248+
<< ", at schemeshard: " << context.SS->SelfTabletId()
249+
);
250+
251+
context.GetTxc().DB.RollbackChanges();
252+
context.SS->AbortOperationPropose(txId, context);
253+
}
254+
255+
bool IsCommitRedoSizeOverLimit(TString* reason, TOperationContext& context) {
256+
// MaxCommitRedoMB is the ICB control shared with NTabletFlatExecutor::TExecutor.
257+
// We subtract from MaxCommitRedoMB additional 1MB for anything extra
258+
// that executor/tablet may (or may not) add under the hood
259+
const ui64 limitBytes = (context.SS->MaxCommitRedoMB - 1) << 20; // MB to bytes
260+
const ui64 commitRedoBytes = context.GetTxc().DB.GetCommitRedoBytes();
261+
if (commitRedoBytes >= limitBytes) {
262+
*reason = TStringBuilder()
263+
<< "local tx commit redo size generated by IgniteOperation() is more than allowed limit: "
264+
<< "commit redo size " << commitRedoBytes
265+
<< ", limit " << limitBytes
266+
<< ", excess " << (commitRedoBytes - limitBytes)
267+
;
268+
return true;
269+
}
270+
return false;
271+
}
272+
220273
struct TSchemeShard::TTxOperationPropose: public NTabletFlatExecutor::TTransactionBase<TSchemeShard> {
221274
using TBase = NTabletFlatExecutor::TTransactionBase<TSchemeShard>;
222275

@@ -236,6 +289,7 @@ struct TSchemeShard::TTxOperationPropose: public NTabletFlatExecutor::TTransacti
236289

237290
bool Execute(NTabletFlatExecutor::TTransactionContext& txc, const TActorContext& ctx) override {
238291
TTabletId selfId = Self->SelfTabletId();
292+
auto txId = TTxId(Request->Get()->Record.GetTxId());
239293

240294
LOG_DEBUG_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD,
241295
"TTxOperationPropose Execute"
@@ -246,7 +300,6 @@ struct TSchemeShard::TTxOperationPropose: public NTabletFlatExecutor::TTransacti
246300

247301
auto [userToken, tokenParseError] = ParseUserToken(Request->Get()->Record.GetUserToken());
248302
if (tokenParseError) {
249-
auto txId = Request->Get()->Record.GetTxId();
250303
Response = MakeHolder<TProposeResponse>(NKikimrScheme::StatusInvalidParameter, ui64(txId), ui64(selfId), "Failed to parse user token");
251304
return true;
252305
}
@@ -258,10 +311,52 @@ struct TSchemeShard::TTxOperationPropose: public NTabletFlatExecutor::TTransacti
258311
TStorageChanges dbChanges;
259312
TOperationContext context{Self, txc, ctx, OnComplete, memChanges, dbChanges, std::move(userToken)};
260313

314+
//NOTE: Successful IgniteOperation will leave created operation in Self->Operations and accumulated changes in the context.
315+
// Unsuccessful IgniteOperation will leave no operation and context will also be clean.
261316
Response = Self->IgniteOperation(*Request->Get(), context);
262317

263-
OnComplete.ApplyOnExecute(Self, txc, ctx);
318+
//NOTE: Successfully created operation also must be checked for the size of this local tx.
319+
//
320+
// Limitation on a commit redo size of local transactions is imposed at the tablet executor level
321+
// (See ydb/core/tablet_flat/flat_executor.cpp, NTabletFlatExecutor::TExecutor::ExecuteTransaction()).
322+
// And a tablet violating that limit is considered broken and will be stopped unconditionally and immediately.
323+
//
324+
// So even if operation was ignited successfully, it's local tx size still must be checked
325+
// as a precaution measure to avoid infinite loop of schemeshard restarting, attempting to propose
326+
// persisted operation again, hitting commit redo size limit and restarting again.
327+
//
328+
// On unsuccessful check, local tx should be rolled back, operation should be rejected and
329+
// all accumulated changes dropped or reverted.
330+
//
331+
332+
// Actually build commit redo (dbChanges could be empty)
264333
dbChanges.Apply(Self, txc, ctx);
334+
335+
if (Self->Operations.contains(txId)) {
336+
Y_ABORT_UNLESS(Response->IsDone() || Response->IsAccepted() || Response->IsConditionalAccepted());
337+
338+
// Check local tx commit redo size
339+
TString reason;
340+
if (IsCommitRedoSizeOverLimit(&reason, context)) {
341+
Response = MakeHolder<TProposeResponse>(NKikimrScheme::StatusSchemeError, ui64(txId), ui64(selfId), reason);
342+
343+
AbortOperation(context, txId, reason);
344+
345+
if (!context.IsUndoChangesSafe()) {
346+
LOG_ERROR_S(context.Ctx, NKikimrServices::FLAT_TX_SCHEMESHARD, "TTxOperationPropose Execute"
347+
<< ", opId: " << txId
348+
<< ", operation should be rejected and all changes be reverted"
349+
<< ", but context.IsUndoChangesSafe is false, which means some direct writes have been done"
350+
<< ", message: " << SecureDebugString(Request->Get()->Record)
351+
<< ", at schemeshard: " << context.SS->SelfTabletId()
352+
);
353+
}
354+
}
355+
}
356+
357+
// Apply accumulated changes (changes could be empty)
358+
OnComplete.ApplyOnExecute(Self, txc, ctx);
359+
265360
return true;
266361
}
267362

ydb/core/tx/schemeshard/schemeshard_impl.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -4463,6 +4463,8 @@ void TSchemeShard::OnActivateExecutor(const TActorContext &ctx) {
44634463
appData->Icb->RegisterSharedControl(DisablePublicationsOfDropping, "SchemeShard_DisablePublicationsOfDropping");
44644464
appData->Icb->RegisterSharedControl(FillAllocatePQ, "SchemeShard_FillAllocatePQ");
44654465

4466+
appData->Icb->RegisterSharedControl(MaxCommitRedoMB, "TabletControls.MaxCommitRedoMB");
4467+
44664468
AllowDataColumnForIndexTable = appData->FeatureFlags.GetEnableDataColumnForIndexTable();
44674469
appData->Icb->RegisterSharedControl(AllowDataColumnForIndexTable, "SchemeShard_AllowDataColumnForIndexTable");
44684470

ydb/core/tx/schemeshard/schemeshard_impl.h

+6-1
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,9 @@ class TSchemeShard
187187
TControlWrapper DisablePublicationsOfDropping;
188188
TControlWrapper FillAllocatePQ;
189189

190+
// Shared with NTabletFlatExecutor::TExecutor
191+
TControlWrapper MaxCommitRedoMB;
192+
190193
TSplitSettings SplitSettings;
191194

192195
struct TTenantInitState {
@@ -370,6 +373,8 @@ class TSchemeShard
370373
NExternalSource::IExternalSourceFactory::TPtr ExternalSourceFactory{NExternalSource::CreateExternalSourceFactory({})};
371374

372375
THolder<TProposeResponse> IgniteOperation(TProposeRequest& request, TOperationContext& context);
376+
void AbortOperationPropose(const TTxId txId, TOperationContext& context);
377+
373378
THolder<TEvDataShard::TEvProposeTransaction> MakeDataShardProposal(const TPathId& pathId, const TOperationId& opId,
374379
const TString& body, const TActorContext& ctx) const;
375380

@@ -419,7 +424,7 @@ class TSchemeShard
419424
return MakeLocalId(NextLocalPathId);
420425
}
421426

422-
TPathId AllocatePathId () {
427+
TPathId AllocatePathId() {
423428
TPathId next = PeekNextPathId();
424429
++NextLocalPathId;
425430
return next;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
#include <ydb/core/tx/schemeshard/ut_helpers/helpers.h>
2+
3+
using namespace NKikimr;
4+
using namespace NSchemeShard;
5+
using namespace NSchemeShardUT_Private;
6+
7+
Y_UNIT_TEST_SUITE(TSchemeShardCheckProposeSize) {
8+
9+
//TODO: can't check all operations as many of them do not implement
10+
// TSubOperation::AbortPropose() properly and will abort.
11+
12+
Y_UNIT_TEST(CopyTable) {
13+
TTestBasicRuntime runtime;
14+
TTestEnv env(runtime);
15+
16+
// Take control over MaxCommitRedoMB ICB setting.
17+
// Drop down its min-value limit to be able to set it as low as test needs.
18+
TControlWrapper MaxCommitRedoMB;
19+
{
20+
runtime.GetAppData().Icb->RegisterSharedControl(MaxCommitRedoMB, "TabletControls.MaxCommitRedoMB");
21+
MaxCommitRedoMB.Reset(200, 1, 4096);
22+
}
23+
24+
ui64 txId = 100;
25+
26+
TestCreateTable(runtime, ++txId, "/MyRoot", R"(
27+
Name: "table"
28+
Columns { Name: "key" Type: "Uint64"}
29+
Columns { Name: "value" Type: "Utf8"}
30+
KeyColumnNames: ["key"]
31+
)");
32+
env.TestWaitNotification(runtime, txId);
33+
34+
// 1. Set MaxCommitRedoMB to 1 and try to create table.
35+
//
36+
// (Check at the operation's Propose tests commit redo size against (MaxCommitRedoMB - 1)
37+
// to give 1MB leeway to executer/tablet inner stuff to may be do "something extra".
38+
// So MaxCommitRedoMB = 1 means effective 0 for the size of operation's commit.)
39+
{
40+
MaxCommitRedoMB = 1;
41+
AsyncCopyTable(runtime, ++txId, "/MyRoot", "table-copy", "/MyRoot/table");
42+
TestModificationResults(runtime, txId,
43+
{{NKikimrScheme::StatusSchemeError, "local tx commit redo size generated by IgniteOperation() is more than allowed limit"}}
44+
);
45+
env.TestWaitNotification(runtime, txId);
46+
}
47+
48+
// 2. Set MaxCommitRedoMB back to high value and try again.
49+
{
50+
MaxCommitRedoMB = 200;
51+
AsyncCopyTable(runtime, ++txId, "/MyRoot", "table-copy", "/MyRoot/table");
52+
env.TestWaitNotification(runtime, txId);
53+
}
54+
}
55+
56+
Y_UNIT_TEST(CopyTables) {
57+
TTestBasicRuntime runtime;
58+
TTestEnv env(runtime);
59+
60+
// Take control over MaxCommitRedoMB ICB setting.
61+
// Drop down its min-value limit to be able to set it as low as test needs.
62+
TControlWrapper MaxCommitRedoMB;
63+
{
64+
runtime.GetAppData().Icb->RegisterSharedControl(MaxCommitRedoMB, "TabletControls.MaxCommitRedoMB");
65+
MaxCommitRedoMB.Reset(200, 1, 4096);
66+
}
67+
68+
const ui64 tables = 100;
69+
const ui64 shardsPerTable = 1;
70+
71+
ui64 txId = 100;
72+
73+
for (ui64 i : xrange(tables)) {
74+
TestCreateTable(runtime, ++txId, "/MyRoot", Sprintf(
75+
R"(
76+
Name: "table-%lu"
77+
Columns { Name: "key" Type: "Uint64"}
78+
Columns { Name: "value" Type: "Utf8"}
79+
KeyColumnNames: ["key"]
80+
UniformPartitionsCount: %lu
81+
)",
82+
i,
83+
shardsPerTable
84+
));
85+
env.TestWaitNotification(runtime, txId);
86+
}
87+
88+
auto testCopyTables = [](auto& runtime, ui64 txId, ui64 tables) {
89+
TVector<TEvTx*> schemeTxs;
90+
for (ui64 i : xrange(tables)) {
91+
schemeTxs.push_back(CopyTableRequest(txId, "/MyRoot", Sprintf("table-%lu-copy", i), Sprintf("/MyRoot/table-%lu", i)));
92+
}
93+
AsyncSend(runtime, TTestTxConfig::SchemeShard, CombineSchemeTransactions(schemeTxs));
94+
};
95+
96+
// 1. Set MaxCommitRedoMB to 1 and try to copy tables.
97+
{
98+
MaxCommitRedoMB = 1;
99+
testCopyTables(runtime, ++txId, tables);
100+
TestModificationResults(runtime, txId,
101+
{{NKikimrScheme::StatusSchemeError, "local tx commit redo size generated by IgniteOperation() is more than allowed limit"}}
102+
);
103+
}
104+
105+
// 2. Set MaxCommitRedoMB back to high value and try again.
106+
{
107+
MaxCommitRedoMB = 200;
108+
testCopyTables(runtime, ++txId, tables);
109+
TestModificationResults(runtime, txId, {{NKikimrScheme::StatusAccepted}});
110+
}
111+
}
112+
113+
}

ydb/core/tx/schemeshard/ut_base/ya.make

+1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ SRCS(
2727
ut_base.cpp
2828
ut_info_types.cpp
2929
ut_table_pg_types.cpp
30+
ut_commit_redo_limit.cpp
3031
)
3132

3233
END()

0 commit comments

Comments
 (0)