Skip to content

Commit 07a3ac8

Browse files
authored
Merge e371a01 into 8418282
2 parents 8418282 + e371a01 commit 07a3ac8

10 files changed

+177
-93
lines changed

ydb/core/tablet_flat/flat_page_btree_index_writer.h

+89-42
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,8 @@ namespace NKikimr::NTable::NPage {
319319
Y_ABORT_UNLESS(Children);
320320
TChild result = Children.front();
321321
Children.pop_front();
322+
PrevDataSize = result.DataSize;
323+
PrevRowCount = result.RowCount;
322324
return result;
323325
}
324326

@@ -334,15 +336,41 @@ namespace NKikimr::NTable::NPage {
334336
return Children.size();
335337
}
336338

339+
ui64 GetPrevRowCount() const {
340+
return PrevRowCount;
341+
}
342+
343+
ui64 GetPrevDataSize() const {
344+
return PrevDataSize;
345+
}
346+
347+
ui64 GetNextRowCount(ui64 prevRowCount) const {
348+
return Children[1].RowCount - prevRowCount;
349+
}
350+
351+
ui64 GetNextDataSize(ui64 prevDataSize) const {
352+
return Children[1].DataSize - prevDataSize;
353+
}
354+
355+
ui64 GetRowCount() const {
356+
return Children.back().RowCount - PrevRowCount;
357+
}
358+
359+
ui64 GetDataSize() const {
360+
return Children.back().DataSize - PrevDataSize;
361+
}
362+
337363
private:
338364
size_t KeysSize = 0;
365+
ui64 PrevRowCount = 0;
366+
ui64 PrevDataSize = 0;
339367
TDeque<TString> Keys;
340368
TDeque<TChild> Children;
341369
};
342370

343371
public:
344372
TBtreeIndexBuilder(TIntrusiveConstPtr<TPartScheme> scheme, TGroupId groupId,
345-
ui32 nodeTargetSize, ui32 nodeKeysMin, ui32 nodeKeysMax)
373+
ui32 nodeTargetSize, ui32 nodeKeysMin, ui32 nodeKeysMax, ui32 leafDataSizeMax, ui32 leafRowsCountMax)
346374
: Scheme(std::move(scheme))
347375
, GroupId(groupId)
348376
, GroupInfo(Scheme->GetLayout(groupId))
@@ -351,6 +379,8 @@ namespace NKikimr::NTable::NPage {
351379
, NodeTargetSize(nodeTargetSize)
352380
, NodeKeysMin(nodeKeysMin)
353381
, NodeKeysMax(nodeKeysMax)
382+
, LeafDataSizeMax(leafDataSizeMax)
383+
, LeafRowsCountMax(leafRowsCountMax)
354384
{
355385
Y_ABORT_UNLESS(NodeTargetSize > 0);
356386
Y_ABORT_UNLESS(NodeKeysMin > 0);
@@ -386,23 +416,35 @@ namespace NKikimr::NTable::NPage {
386416

387417
Levels[0].PushChild(child);
388418
}
419+
420+
void Flush(IPageWriter &pager) {
421+
for (ui32 levelIndex = 0; levelIndex < Levels.size(); levelIndex++) {
422+
bool hasChanges = false;
423+
424+
// Note: in theory we may want to flush one level multiple times when different triggers are applicable
425+
while (CanFlush(levelIndex)) {
426+
DoFlush(levelIndex, pager, false);
427+
hasChanges = true;
428+
}
429+
430+
if (!hasChanges) {
431+
break; // no more changes
432+
}
433+
}
434+
}
389435

390-
std::optional<TBtreeIndexMeta> Flush(IPageWriter &pager, bool last) {
391-
Y_ABORT_UNLESS(Levels.size() < Max<ui32>(), "Levels size is out of bounds");
436+
TBtreeIndexMeta Finish(IPageWriter &pager) {
392437
for (ui32 levelIndex = 0; levelIndex < Levels.size(); levelIndex++) {
393-
if (last && !Levels[levelIndex].GetKeysCount()) {
438+
if (!Levels[levelIndex].GetKeysCount()) {
394439
Y_ABORT_UNLESS(Levels[levelIndex].GetChildrenCount() == 1, "Should be root");
395-
return TBtreeIndexMeta{ Levels[levelIndex].PopChild(), levelIndex, IndexSize };
440+
Y_ABORT_UNLESS(levelIndex + 1 == Levels.size(), "Should be root");
441+
return {Levels[levelIndex].PopChild(), levelIndex, IndexSize};
396442
}
397443

398-
if (!TryFlush(levelIndex, pager, last)) {
399-
Y_ABORT_UNLESS(!last);
400-
break;
401-
}
444+
DoFlush(levelIndex, pager, true);
402445
}
403446

404-
Y_ABORT_UNLESS(!last, "Should have returned root");
405-
return { };
447+
Y_ABORT_UNLESS(false, "Should have returned root");
406448
}
407449

408450
void Reset() {
@@ -415,43 +457,47 @@ namespace NKikimr::NTable::NPage {
415457
}
416458

417459
private:
418-
bool TryFlush(ui32 levelIndex, IPageWriter &pager, bool last) {
419-
if (!last && Levels[levelIndex].GetKeysCount() <= 2 * NodeKeysMax) {
420-
// Note: node should meet both NodeKeysMin and NodeSize restrictions for split
460+
bool CanFlush(ui32 levelIndex) {
461+
const ui64 waitFullNodes = 2;
421462

422-
if (Levels[levelIndex].GetKeysCount() <= 2 * NodeKeysMin) {
423-
// not enough keys for split
424-
return false;
425-
}
426-
427-
// Note: this size check is approximate and we might not produce 2 full-sized pages
428-
if (CalcPageSize(Levels[levelIndex]) <= 2 * NodeTargetSize) {
429-
// not enough bytes for split
430-
return false;
431-
}
463+
if (Levels[levelIndex].GetKeysCount() <= waitFullNodes * NodeKeysMin) {
464+
// the NodeKeysMin restriction should always be satisfied
465+
return false;
432466
}
433467

468+
// Note: size checks are approximate and flush might not produce 2 full-sized pages
469+
470+
return
471+
Levels[levelIndex].GetKeysCount() > waitFullNodes * NodeKeysMax ||
472+
CalcPageSize(Levels[levelIndex]) > waitFullNodes * NodeTargetSize ||
473+
levelIndex == 0 && Levels[levelIndex].GetDataSize() > waitFullNodes * LeafDataSizeMax ||
474+
levelIndex == 0 && Levels[levelIndex].GetRowCount() > waitFullNodes * LeafRowsCountMax;
475+
}
476+
477+
void DoFlush(ui32 levelIndex, IPageWriter &pager, bool last) {
434478
Writer.EnsureEmpty();
479+
auto prevDataSize = Levels[levelIndex].GetPrevDataSize();
480+
auto prevRowCount = Levels[levelIndex].GetPrevRowCount();
435481

436-
// Note: for now we build last nodes from all remaining level's keys
437-
// we may try splitting them more evenly later
482+
if (last) {
483+
// Note: for now we build last nodes from all remaining level's keys
484+
// we may try splitting them more evenly later
438485

439-
while (last || Writer.GetKeysCount() < NodeKeysMin || Writer.CalcPageSize() < NodeTargetSize) {
440-
if (!last && Levels[levelIndex].GetKeysCount() < 3) {
441-
// we shouldn't produce empty nodes (but can violate NodeKeysMin restriction)
442-
break;
486+
while (Levels[levelIndex].GetKeysCount()) {
487+
Writer.AddChild(Levels[levelIndex].PopChild());
488+
Writer.AddKey(Levels[levelIndex].PopKey());
443489
}
444-
if (!last && Writer.GetKeysCount() >= NodeKeysMax) {
445-
// have enough keys
446-
break;
447-
}
448-
if (last && !Levels[levelIndex].GetKeysCount()) {
449-
// nothing left
450-
break;
490+
} else {
491+
while (Writer.GetKeysCount() < NodeKeysMin || (
492+
// can add more to writer if:
493+
Levels[levelIndex].GetKeysCount() > 2 &&
494+
Writer.GetKeysCount() < NodeKeysMax &&
495+
Writer.CalcPageSize() < NodeTargetSize &&
496+
(levelIndex != 0 || Levels[levelIndex].GetNextDataSize(prevDataSize) < LeafDataSizeMax) &&
497+
(levelIndex != 0 || Levels[levelIndex].GetNextRowCount(prevRowCount) < LeafRowsCountMax))) {
498+
Writer.AddChild(Levels[levelIndex].PopChild());
499+
Writer.AddKey(Levels[levelIndex].PopKey());
451500
}
452-
453-
Writer.AddChild(Levels[levelIndex].PopChild());
454-
Writer.AddKey(Levels[levelIndex].PopKey());
455501
}
456502
auto lastChild = Levels[levelIndex].PopChild();
457503
Writer.AddChild(lastChild);
@@ -462,6 +508,7 @@ namespace NKikimr::NTable::NPage {
462508

463509
if (levelIndex + 1 == Levels.size()) {
464510
Levels.emplace_back();
511+
Y_ABORT_UNLESS(Levels.size() < Max<ui32>(), "Levels size is out of bounds");
465512
}
466513
Levels[levelIndex + 1].PushChild(TChild{pageId, lastChild.RowCount, lastChild.DataSize, lastChild.ErasedRowCount});
467514
if (!last) {
@@ -475,8 +522,6 @@ namespace NKikimr::NTable::NPage {
475522
} else {
476523
Y_ABORT_UNLESS(Levels[levelIndex].GetKeysCount(), "Shouldn't leave empty levels");
477524
}
478-
479-
return true;
480525
}
481526

482527
size_t CalcPageSize(const TLevel& level) const {
@@ -497,6 +542,8 @@ namespace NKikimr::NTable::NPage {
497542
const ui32 NodeTargetSize;
498543
const ui32 NodeKeysMin;
499544
const ui32 NodeKeysMax;
545+
const ui32 LeafDataSizeMax;
546+
const ui32 LeafRowsCountMax;
500547

501548
TRowId ChildRowCount = 0;
502549
TRowId ChildErasedRowCount = 0;

ydb/core/tablet_flat/flat_page_conf.h

+5-3
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,11 @@ namespace NPage {
6363
ui32 PageRows = Max<ui32>(); /* Max rows per page, for UTs */
6464
ui32 IndexMin = 32 * 1024; /* Index initial buffer size */
6565

66-
ui32 BTreeIndexNodeTargetSize = 7 * 1024; /* 1 GB of (up to) 140B keys leads to 3-level B-Tree index */
67-
ui32 BTreeIndexNodeKeysMin = 6; /* 1 GB of 7KB keys leads to 6-level B-Tree index (node size - ~42KB) */
68-
ui32 BTreeIndexNodeKeysMax = Max<ui32>(); /* for UTs */
66+
ui32 BTreeIndexNodeTargetSize = 7 * 1024; /* 1 GB of (up to) 140B keys leads to 3-level B-Tree index */
67+
ui32 BTreeIndexNodeKeysMin = 6; /* 1 GB of 7KB keys leads to 6-level B-Tree index (node size - ~42KB) */
68+
ui32 BTreeIndexNodeKeysMax = Max<ui32>(); /* for UTs */
69+
ui32 BTreeIndexLeafDataSizeMax = 1024*1024; /* gDbStatsDataSizeResolution / gDbStatsResolutionMultiplier */
70+
ui32 BTreeIndexLeafRowsCountMax = 10000; /* gDbStatsRowCountResolution / gDbStatsResolutionMultiplier */
6971
};
7072

7173
struct TConf {

ydb/core/tablet_flat/flat_part_writer.h

+6-4
Original file line numberDiff line numberDiff line change
@@ -532,12 +532,12 @@ namespace NTable {
532532
if (WriteBTreeIndex) {
533533
Current.BTreeGroupIndexes.reserve(Groups.size());
534534
for (auto& g : Groups) {
535-
Current.BTreeGroupIndexes.push_back(g.BTreeIndex.Flush(Pager, true).value());
535+
Current.BTreeGroupIndexes.push_back(g.BTreeIndex.Finish(Pager));
536536
}
537537
if (Current.HistoryWritten > 0) {
538538
Current.BTreeHistoricIndexes.reserve(Histories.size());
539539
for (auto& g : Histories) {
540-
Current.BTreeHistoricIndexes.push_back(g.BTreeIndex.Flush(Pager, true).value());
540+
Current.BTreeHistoricIndexes.push_back(g.BTreeIndex.Finish(Pager));
541541
}
542542
}
543543
}
@@ -807,7 +807,7 @@ namespace NTable {
807807
} else {
808808
g.BTreeIndex.AddShortChild({page, dataPage->Count, raw.size()});
809809
}
810-
g.BTreeIndex.Flush(Pager, false);
810+
g.BTreeIndex.Flush(Pager);
811811
}
812812

813813
// N.B. hack to save the last row/key for the main group
@@ -1086,7 +1086,9 @@ namespace NTable {
10861086
, Codec(conf.Groups[groupId.Index].Codec)
10871087
, Data(scheme, conf, tags, groupId)
10881088
, Index(scheme, conf, groupId)
1089-
, BTreeIndex(scheme, groupId, conf.Groups[groupId.Index].BTreeIndexNodeTargetSize, conf.Groups[groupId.Index].BTreeIndexNodeKeysMin, conf.Groups[groupId.Index].BTreeIndexNodeKeysMax)
1089+
, BTreeIndex(scheme, groupId, conf.Groups[groupId.Index].BTreeIndexNodeTargetSize,
1090+
conf.Groups[groupId.Index].BTreeIndexNodeKeysMin, conf.Groups[groupId.Index].BTreeIndexNodeKeysMax,
1091+
conf.Groups[groupId.Index].BTreeIndexLeafDataSizeMax, conf.Groups[groupId.Index].BTreeIndexLeafRowsCountMax)
10901092
{ }
10911093
};
10921094

ydb/core/tablet_flat/flat_stat_part.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ class TStatsScreenedPartIterator {
4848
for (bool historic : {false, true}) {
4949
for (ui32 groupIndex : xrange(historic ? Part->HistoricGroupsCount : Part->GroupsCount)) {
5050
ui64 groupRowCountResolution, groupDataSizeResolution;
51-
if (groupIndex == 0 && Part->GroupsCount > 1) {
51+
if (groupIndex == 0 && (Part->GroupsCount > 1 || Small || Large)) {
5252
// make steps as small as possible because they will affect groups resolution
5353
groupRowCountResolution = groupDataSizeResolution = 0;
5454
} else {

ydb/core/tablet_flat/flat_stat_table.cpp

+3-8
Original file line numberDiff line numberDiff line change
@@ -6,19 +6,14 @@
66
namespace NKikimr {
77
namespace NTable {
88

9-
bool BuildStats(const TSubset& subset, TStats& stats, ui64 rowCountResolution, ui64 dataSizeResolution, IPages* env) {
9+
bool BuildStats(const TSubset& subset, TStats& stats, ui64 rowCountResolution, ui64 dataSizeResolution, ui32 resolutionMultiplier, IPages* env) {
1010
stats.Clear();
1111

1212
TDataStats iteratorStats = { };
1313
TStatsIterator statsIterator(subset.Scheme->Keys);
1414

15-
TSet<TEpoch> epochs;
16-
for (const auto& part : subset.Flatten) {
17-
epochs.insert(part->Epoch);
18-
}
19-
// if rowCountResolution = 300, 3-leveled SST, let's move each iterator up to 25 rows
20-
ui64 iterRowCountResolution = rowCountResolution / Max<ui64>(1, epochs.size()) / 4;
21-
ui64 iterDataSizeResolution = dataSizeResolution / Max<ui64>(1, epochs.size()) / 4;
15+
ui64 iterRowCountResolution = rowCountResolution / resolutionMultiplier;
16+
ui64 iterDataSizeResolution = dataSizeResolution / resolutionMultiplier;
2217

2318
// Make index iterators for all parts
2419
bool started = true;

ydb/core/tablet_flat/flat_stat_table.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ class TKeyAccessSample {
188188
THashMap<TString, ui64> KeyRefCount;
189189
};
190190

191-
bool BuildStats(const TSubset& subset, TStats& stats, ui64 rowCountResolution, ui64 dataSizeResolution, IPages* env);
191+
bool BuildStats(const TSubset& subset, TStats& stats, ui64 rowCountResolution, ui64 dataSizeResolution, ui32 resolutionMultiplier, IPages* env);
192192
void GetPartOwners(const TSubset& subset, THashSet<ui64>& partOwners);
193193

194194
}}

0 commit comments

Comments
 (0)