Skip to content

BTreeIndex Split Flush method, use bigger resolution #3182

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Mar 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 51 additions & 42 deletions ydb/core/tablet_flat/flat_page_btree_index_writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -386,23 +386,35 @@ namespace NKikimr::NTable::NPage {

Levels[0].PushChild(child);
}

void Flush(IPageWriter &pager) {
for (ui32 levelIndex = 0; levelIndex < Levels.size(); levelIndex++) {
bool hasChanges = false;

// Note: in theory we may want to flush one level multiple times when different triggers are applicable
while (CanFlush(levelIndex)) {
DoFlush(levelIndex, pager, false);
hasChanges = true;
}

std::optional<TBtreeIndexMeta> Flush(IPageWriter &pager, bool last) {
Y_ABORT_UNLESS(Levels.size() < Max<ui32>(), "Levels size is out of bounds");
if (!hasChanges) {
break; // no more changes
}
}
}

TBtreeIndexMeta Finish(IPageWriter &pager) {
for (ui32 levelIndex = 0; levelIndex < Levels.size(); levelIndex++) {
if (last && !Levels[levelIndex].GetKeysCount()) {
if (!Levels[levelIndex].GetKeysCount()) {
Y_ABORT_UNLESS(Levels[levelIndex].GetChildrenCount() == 1, "Should be root");
return TBtreeIndexMeta{ Levels[levelIndex].PopChild(), levelIndex, IndexSize };
Y_ABORT_UNLESS(levelIndex + 1 == Levels.size(), "Should be root");
return {Levels[levelIndex].PopChild(), levelIndex, IndexSize};
}

if (!TryFlush(levelIndex, pager, last)) {
Y_ABORT_UNLESS(!last);
break;
}
DoFlush(levelIndex, pager, true);
}

Y_ABORT_UNLESS(!last, "Should have returned root");
return { };
Y_ABORT_UNLESS(false, "Should have returned root");
}

void Reset() {
Expand All @@ -415,43 +427,41 @@ namespace NKikimr::NTable::NPage {
}

private:
bool TryFlush(ui32 levelIndex, IPageWriter &pager, bool last) {
if (!last && Levels[levelIndex].GetKeysCount() <= 2 * NodeKeysMax) {
// Note: node should meet both NodeKeysMin and NodeSize restrictions for split
bool CanFlush(ui32 levelIndex) {
const ui64 waitFullNodes = 2;

if (Levels[levelIndex].GetKeysCount() <= 2 * NodeKeysMin) {
// not enough keys for split
return false;
}

// Note: this size check is approximate and we might not produce 2 full-sized pages
if (CalcPageSize(Levels[levelIndex]) <= 2 * NodeTargetSize) {
// not enough bytes for split
return false;
}
if (Levels[levelIndex].GetKeysCount() <= waitFullNodes * NodeKeysMin) {
// node keys min restriction should be always satisfied
return false;
}

Writer.EnsureEmpty();
// Note: size checks are approximate and flush might not produce 2 full-sized pages

// Note: for now we build last nodes from all remaining level's keys
// we may try splitting them more evenly later
return
Levels[levelIndex].GetKeysCount() > waitFullNodes * NodeKeysMax ||
CalcPageSize(Levels[levelIndex]) > waitFullNodes * NodeTargetSize;
}

while (last || Writer.GetKeysCount() < NodeKeysMin || Writer.CalcPageSize() < NodeTargetSize) {
if (!last && Levels[levelIndex].GetKeysCount() < 3) {
// we shouldn't produce empty nodes (but can violate NodeKeysMin restriction)
break;
}
if (!last && Writer.GetKeysCount() >= NodeKeysMax) {
// have enough keys
break;
void DoFlush(ui32 levelIndex, IPageWriter &pager, bool last) {
Writer.EnsureEmpty();

if (last) {
// Note: for now we build last nodes from all remaining level's keys
// we may try splitting them more evenly later

while (Levels[levelIndex].GetKeysCount()) {
Writer.AddChild(Levels[levelIndex].PopChild());
Writer.AddKey(Levels[levelIndex].PopKey());
}
if (last && !Levels[levelIndex].GetKeysCount()) {
// nothing left
break;
} else {
while (Writer.GetKeysCount() < NodeKeysMin || (
// can add more to writer if:
Levels[levelIndex].GetKeysCount() > 2 &&
Writer.GetKeysCount() < NodeKeysMax &&
Writer.CalcPageSize() < NodeTargetSize)) {
Writer.AddChild(Levels[levelIndex].PopChild());
Writer.AddKey(Levels[levelIndex].PopKey());
}

Writer.AddChild(Levels[levelIndex].PopChild());
Writer.AddKey(Levels[levelIndex].PopKey());
}
auto lastChild = Levels[levelIndex].PopChild();
Writer.AddChild(lastChild);
Expand All @@ -462,6 +472,7 @@ namespace NKikimr::NTable::NPage {

if (levelIndex + 1 == Levels.size()) {
Levels.emplace_back();
Y_ABORT_UNLESS(Levels.size() < Max<ui32>(), "Levels size is out of bounds");
}
Levels[levelIndex + 1].PushChild(TChild{pageId, lastChild.RowCount, lastChild.DataSize, lastChild.ErasedRowCount});
if (!last) {
Expand All @@ -475,8 +486,6 @@ namespace NKikimr::NTable::NPage {
} else {
Y_ABORT_UNLESS(Levels[levelIndex].GetKeysCount(), "Shouldn't leave empty levels");
}

return true;
}

size_t CalcPageSize(const TLevel& level) const {
Expand Down
6 changes: 3 additions & 3 deletions ydb/core/tablet_flat/flat_part_writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -532,12 +532,12 @@ namespace NTable {
if (WriteBTreeIndex) {
Current.BTreeGroupIndexes.reserve(Groups.size());
for (auto& g : Groups) {
Current.BTreeGroupIndexes.push_back(g.BTreeIndex.Flush(Pager, true).value());
Current.BTreeGroupIndexes.push_back(g.BTreeIndex.Finish(Pager));
}
if (Current.HistoryWritten > 0) {
Current.BTreeHistoricIndexes.reserve(Histories.size());
for (auto& g : Histories) {
Current.BTreeHistoricIndexes.push_back(g.BTreeIndex.Flush(Pager, true).value());
Current.BTreeHistoricIndexes.push_back(g.BTreeIndex.Finish(Pager));
}
}
}
Expand Down Expand Up @@ -807,7 +807,7 @@ namespace NTable {
} else {
g.BTreeIndex.AddShortChild({page, dataPage->Count, raw.size()});
}
g.BTreeIndex.Flush(Pager, false);
g.BTreeIndex.Flush(Pager);
}

// N.B. hack to save the last row/key for the main group
Expand Down
2 changes: 1 addition & 1 deletion ydb/core/tablet_flat/flat_stat_part.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ class TStatsScreenedPartIterator {
for (bool historic : {false, true}) {
for (ui32 groupIndex : xrange(historic ? Part->HistoricGroupsCount : Part->GroupsCount)) {
ui64 groupRowCountResolution, groupDataSizeResolution;
if (groupIndex == 0 && Part->GroupsCount > 1) {
if (groupIndex == 0 && (Part->GroupsCount > 1 || Small || Large)) {
// make steps as small as possible because they will affect groups resolution
groupRowCountResolution = groupDataSizeResolution = 0;
} else {
Expand Down
10 changes: 2 additions & 8 deletions ydb/core/tablet_flat/flat_stat_table.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,14 @@ bool BuildStats(const TSubset& subset, TStats& stats, ui64 rowCountResolution, u
TDataStats iteratorStats = { };
TStatsIterator statsIterator(subset.Scheme->Keys);

TSet<TEpoch> epochs;
for (const auto& part : subset.Flatten) {
epochs.insert(part->Epoch);
}
// if rowCountResolution = 300, 3-leveled SST, let's move each iterator up to 25 rows
ui64 iterRowCountResolution = rowCountResolution / Max<ui64>(1, epochs.size()) / 4;
ui64 iterDataSizeResolution = dataSizeResolution / Max<ui64>(1, epochs.size()) / 4;
// TODO: deal with resolution

// Make index iterators for all parts
bool started = true;
for (const auto& part : subset.Flatten) {
stats.IndexSize.Add(part->IndexesRawSize, part->Label.Channel());
TAutoPtr<TStatsScreenedPartIterator> iter = new TStatsScreenedPartIterator(part, env, subset.Scheme->Keys, part->Small, part->Large,
iterRowCountResolution, iterDataSizeResolution);
rowCountResolution, dataSizeResolution);
auto ready = iter->Start();
if (ready == EReady::Page) {
started = false;
Expand Down
49 changes: 22 additions & 27 deletions ydb/core/tablet_flat/ut/ut_btree_index_nodes.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "flat_page_btree_index.h"
#include "flat_page_btree_index_writer.h"
#include "test/libs/table/test_writer.h"
#include "ydb/core/tx/datashard/datashard.h"
#include <ydb/core/tablet_flat/test/libs/rows/layout.h>
#include <library/cpp/testing/unittest/registar.h>

Expand Down Expand Up @@ -505,11 +506,10 @@ Y_UNIT_TEST_SUITE(TBtreeIndexBuilder) {
builder.AddChild(child);

TWriterBundle pager(1, TLogoBlobID());
auto result = builder.Flush(pager, true);
UNIT_ASSERT(result);
auto result = builder.Finish(pager);

TBtreeIndexMeta expected{child, 0, 0};
UNIT_ASSERT_EQUAL_C(*result, expected, "Got " + result->ToString());
UNIT_ASSERT_EQUAL_C(result, expected, "Got " + result.ToString());
}

Y_UNIT_TEST(OneNode) {
Expand All @@ -536,15 +536,14 @@ Y_UNIT_TEST_SUITE(TBtreeIndexBuilder) {
}

TWriterBundle pager(1, TLogoBlobID());
auto result = builder.Flush(pager, true);
UNIT_ASSERT(result);
auto result = builder.Finish(pager);

Dump(*result, builder.GroupInfo, pager.Back());
Dump(result, builder.GroupInfo, pager.Back());

TBtreeIndexMeta expected{{0, 1155, 11055, 385}, 1, 595};
UNIT_ASSERT_EQUAL_C(*result, expected, "Got " + result->ToString());
UNIT_ASSERT_EQUAL_C(result, expected, "Got " + result.ToString());

CheckKeys(result->PageId, keys, builder.GroupInfo, pager.Back());
CheckKeys(result.PageId, keys, builder.GroupInfo, pager.Back());
}

Y_UNIT_TEST(FewNodes) {
Expand All @@ -569,16 +568,21 @@ Y_UNIT_TEST_SUITE(TBtreeIndexBuilder) {
TSerializedCellVec deserialized(keys[i]);
builder.AddKey(deserialized.GetCells());
builder.AddChild(children[i + 1]);
UNIT_ASSERT(!builder.Flush(pager, false));
builder.Flush(pager);
}

auto result = builder.Flush(pager, true);
UNIT_ASSERT(result);
auto result = builder.Finish(pager);

Dump(*result, builder.GroupInfo, pager.Back());

UNIT_ASSERT_VALUES_EQUAL(result->LevelCount, 3);
Dump(result, builder.GroupInfo, pager.Back());

TBtreeIndexMeta expected{{9, 0, 0, 0}, 3, 1550};
for (auto c : children) {
expected.RowCount += c.RowCount;
expected.DataSize += c.DataSize;
expected.ErasedRowCount += c.ErasedRowCount;
}
UNIT_ASSERT_EQUAL_C(result, expected, "Got " + result.ToString());

auto checkKeys = [&](TPageId pageId, const TVector<TString>& keys) {
CheckKeys(pageId, keys, builder.GroupInfo, pager.Back());
};
Expand Down Expand Up @@ -624,14 +628,6 @@ Y_UNIT_TEST_SUITE(TBtreeIndexBuilder) {
checkKeys(9, {
keys[8]
});

TBtreeIndexMeta expected{{9, 0, 0, 0}, 3, 1550};
for (auto c : children) {
expected.RowCount += c.RowCount;
expected.DataSize += c.DataSize;
expected.ErasedRowCount += c.ErasedRowCount;
}
UNIT_ASSERT_EQUAL_C(*result, expected, "Got " + result->ToString());
}

Y_UNIT_TEST(SplitBySize) {
Expand All @@ -656,16 +652,15 @@ Y_UNIT_TEST_SUITE(TBtreeIndexBuilder) {
TSerializedCellVec deserialized(keys[i]);
builder.AddKey(deserialized.GetCells());
builder.AddChild(children[i + 1]);
UNIT_ASSERT(!builder.Flush(pager, false));
builder.Flush(pager);
}

auto result = builder.Flush(pager, true);
UNIT_ASSERT(result);
auto result = builder.Finish(pager);

Dump(*result, builder.GroupInfo, pager.Back());
Dump(result, builder.GroupInfo, pager.Back());

TBtreeIndexMeta expected{{15, 15150, 106050, 8080}, 3, 10270};
UNIT_ASSERT_EQUAL_C(*result, expected, "Got " + result->ToString());
UNIT_ASSERT_EQUAL_C(result, expected, "Got " + result.ToString());
}

}
Expand Down
2 changes: 1 addition & 1 deletion ydb/core/tablet_flat/ut/ut_stat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ Y_UNIT_TEST_SUITE(BuildStats) {
{
auto subset = TMake(Mass1, PageConf(Mass1.Model->Scheme->Families.size(), true, true)).Mixed(0, 1, TMixerOne{ }, 0.3, 13);
subset->Flatten.begin()->Slices->Describe(Cerr); Cerr << Endl;
Check(*subset, 13570, 2186460 /* ~2277890 */, 42292, 5310, 531050);
Check(*subset, 13570, 2114857 /* ~2277890 */, 42292, 5310, 531050);
}
}

Expand Down
2 changes: 1 addition & 1 deletion ydb/core/tx/datashard/datashard.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ using namespace NSchemeShard;
using namespace NTabletFlatExecutor;

// NOTE: We really want to batch log records by default in datashards!
// But in unittests we want to test both scenarios
// But in unit tests we want to test both scenarios
bool gAllowLogBatchingDefaultValue = true;

TDuration gDbStatsReportInterval = TDuration::Seconds(10);
Expand Down
Loading