Skip to content

Commit f93a9f3

Browse files
authored
TTableHistogramBuilderBtreeIndex improvements (#5451)
1 parent 71dea30 commit f93a9f3

12 files changed

+115
-54
lines changed

ydb/core/protos/tx_datashard.proto

+1
Original file line numberDiff line numberDiff line change
@@ -769,6 +769,7 @@ message TEvGetTableStats {
769769
optional uint64 DataSizeResolution = 2;
770770
optional uint64 RowCountResolution = 3;
771771
optional bool CollectKeySample = 4;
772+
optional uint32 HistogramBucketsCount = 5;
772773
}
773774

774775
message TEvGetTableStatsResult {

ydb/core/tablet_flat/benchmark/b_part.cpp

+31-16
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include <ydb/core/tablet_flat/test/libs/table/wrap_part.h>
1818
#include <ydb/core/tablet_flat/test/libs/table/test_steps.h>
1919

20+
// ya test -r -D BENCHMARK_MAKE_LARGE_PART
2021
#ifndef BENCHMARK_MAKE_LARGE_PART
2122
#define BENCHMARK_MAKE_LARGE_PART 0
2223
#endif
@@ -37,8 +38,6 @@ namespace {
3738

3839
conf.WriteBTreeIndex = writeBTreeIndex;
3940

40-
conf.SliceSize = conf.Group(0).PageSize * 4;
41-
4241
return conf;
4342
}
4443

@@ -48,24 +47,33 @@ namespace {
4847
void SetUp(::benchmark::State& state)
4948
{
5049
const bool useBTree = state.range(0);
51-
const bool groups = state.range(1);
52-
const bool history = state.range(2);
50+
const ui32 partsCount = state.range(1);
51+
const bool groups = state.range(2);
52+
const bool history = state.range(3);
5353

5454
ui64 rows = history ? 300000 : 1000000;
5555
if (BENCHMARK_MAKE_LARGE_PART) {
5656
rows *= 10;
5757
}
5858
Mass = new NTest::TMass(new NTest::TModelStd(groups), rows);
59-
Subset = TMake(*Mass, PageConf(Mass->Model->Scheme->Families.size(), useBTree)).Mixed(0, 1, TMixerOne{ }, history ? 0.7 : 0);
59+
Subset = TMake(*Mass, PageConf(Mass->Model->Scheme->Families.size(), useBTree)).Mixed(0, partsCount, TMixerRnd(partsCount), history ? 0.7 : 0);
6060

61+
ui64 dataBytes = 0, dataPages = 0, indexBytes = 0;
62+
ui32 bTreeLevels = 0;
6163
for (const auto& part : Subset->Flatten) { // aggregate counters over all parts
62-
state.counters["DataBytes"] = part->Stat.Bytes;
63-
state.counters["DataPages"] = IndexTools::CountMainPages(*part);
64-
state.counters["IndexBytes"] = part->IndexesRawSize;
64+
dataBytes += part->Stat.Bytes;
65+
dataPages += IndexTools::CountMainPages(*part);
66+
indexBytes += part->IndexesRawSize;
6567
if (useBTree) {
66-
state.counters["Levels{0}"] = part->IndexPages.BTreeGroups[0].LevelCount;
68+
bTreeLevels = Max(bTreeLevels, part->IndexPages.BTreeGroups[0].LevelCount);
6769
}
6870
}
71+
state.counters["DataBytes"] = dataBytes;
72+
state.counters["DataPages"] = dataPages;
73+
state.counters["IndexBytes"] = indexBytes;
74+
if (useBTree) {
75+
state.counters["Levels{0}"] = bTreeLevels;
76+
}
6977

7078
if (history) {
7179
Checker = new TCheckIter(*Subset, {new TTestEnv()}, TRowVersion(0, 8));
@@ -150,7 +158,7 @@ BENCHMARK_DEFINE_F(TPartFixture, Prev)(benchmark::State& state) {
150158

151159
BENCHMARK_DEFINE_F(TPartFixture, SeekKey)(benchmark::State& state) {
152160
const bool useBTree = state.range(0);
153-
const ESeek seek = ESeek(state.range(3));
161+
const ESeek seek = ESeek(state.range(4));
154162

155163
TRowTool rowTool(*Subset->Scheme);
156164
auto tags = TVector<TTag>();
@@ -178,9 +186,9 @@ BENCHMARK_DEFINE_F(TPartFixture, SeekKey)(benchmark::State& state) {
178186
}
179187

180188
BENCHMARK_DEFINE_F(TPartFixture, DoReads)(benchmark::State& state) {
181-
const bool reverse = state.range(3);
182-
const ESeek seek = static_cast<ESeek>(state.range(4));
183-
const ui32 items = state.range(5);
189+
const bool reverse = state.range(4);
190+
const ESeek seek = static_cast<ESeek>(state.range(5));
191+
const ui32 items = state.range(6);
184192

185193
for (auto _ : state) {
186194
auto it = Mass->Saved.Any(Rnd);
@@ -200,8 +208,8 @@ BENCHMARK_DEFINE_F(TPartFixture, DoReads)(benchmark::State& state) {
200208
}
201209

202210
BENCHMARK_DEFINE_F(TPartFixture, DoCharge)(benchmark::State& state) {
203-
const bool reverse = state.range(3);
204-
const ui32 items = state.range(4);
211+
const bool reverse = state.range(4);
212+
const ui32 items = state.range(5);
205213

206214
auto tags = TVector<TTag>();
207215
for (auto c : Subset->Scheme->Cols) {
@@ -226,34 +234,38 @@ BENCHMARK_DEFINE_F(TPartFixture, DoCharge)(benchmark::State& state) {
226234
BENCHMARK_DEFINE_F(TPartFixture, BuildStats)(benchmark::State& state) {
227235
for (auto _ : state) {
228236
TStats stats;
229-
BuildStats(*Subset, stats, NDataShard::gDbStatsRowCountResolution, NDataShard::gDbStatsDataSizeResolution, &Env, [](){});
237+
BuildStats(*Subset, stats, NDataShard::gDbStatsRowCountResolution, NDataShard::gDbStatsDataSizeResolution, NDataShard::gDbStatsHistogramBucketsCount, &Env, [](){});
230238
}
231239
}
232240

233241
BENCHMARK_REGISTER_F(TPartFixture, SeekRowId)
234242
->ArgsProduct({
235243
/* b-tree */ {0, 1},
244+
/* parts */ {4},
236245
/* groups: */ {0, 1},
237246
/* history: */ {0}})
238247
->Unit(benchmark::kMicrosecond);
239248

240249
BENCHMARK_REGISTER_F(TPartFixture, Next)
241250
->ArgsProduct({
242251
/* b-tree */ {0, 1},
252+
/* parts */ {4},
243253
/* groups: */ {0, 1},
244254
/* history: */ {0}})
245255
->Unit(benchmark::kMicrosecond);
246256

247257
BENCHMARK_REGISTER_F(TPartFixture, Prev)
248258
->ArgsProduct({
249259
/* b-tree */ {0, 1},
260+
/* parts */ {4},
250261
/* groups: */ {0, 1},
251262
/* history: */ {0}})
252263
->Unit(benchmark::kMicrosecond);
253264

254265
BENCHMARK_REGISTER_F(TPartFixture, SeekKey)
255266
->ArgsProduct({
256267
/* b-tree */ {0, 1},
268+
/* parts */ {4},
257269
/* groups: */ {0, 1},
258270
/* history: */ {0},
259271
/* ESeek: */ {1}})
@@ -262,6 +274,7 @@ BENCHMARK_REGISTER_F(TPartFixture, SeekKey)
262274
BENCHMARK_REGISTER_F(TPartFixture, DoReads)
263275
->ArgsProduct({
264276
/* b-tree */ {0, 1},
277+
/* parts */ {4},
265278
/* groups: */ {1},
266279
/* history: */ {1},
267280
/* reverse: */ {0},
@@ -272,6 +285,7 @@ BENCHMARK_REGISTER_F(TPartFixture, DoReads)
272285
BENCHMARK_REGISTER_F(TPartFixture, DoCharge)
273286
->ArgsProduct({
274287
/* b-tree */ {0, 1},
288+
/* parts */ {4},
275289
/* groups: */ {1},
276290
/* history: */ {1},
277291
/* reverse: */ {0},
@@ -281,6 +295,7 @@ BENCHMARK_REGISTER_F(TPartFixture, DoCharge)
281295
BENCHMARK_REGISTER_F(TPartFixture, BuildStats)
282296
->ArgsProduct({
283297
/* b-tree */ {0, 1},
298+
/* parts */ {1, 4, 10},
284299
/* groups: */ {0, 1},
285300
/* history: */ {0, 1}})
286301
->Unit(benchmark::kMicrosecond);

ydb/core/tablet_flat/flat_stat_table.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
namespace NKikimr {
88
namespace NTable {
99

10-
bool BuildStats(const TSubset& subset, TStats& stats, ui64 rowCountResolution, ui64 dataSizeResolution, IPages* env, TBuildStatsYieldHandler yieldHandler) {
10+
bool BuildStats(const TSubset& subset, TStats& stats, ui64 rowCountResolution, ui64 dataSizeResolution, ui32 histogramBucketsCount, IPages* env, TBuildStatsYieldHandler yieldHandler) {
1111
stats.Clear();
1212

1313
bool mixedIndex = false;
@@ -22,7 +22,7 @@ bool BuildStats(const TSubset& subset, TStats& stats, ui64 rowCountResolution, u
2222

2323
return mixedIndex
2424
? BuildStatsMixedIndex(subset, stats, rowCountResolution, dataSizeResolution, env, yieldHandler)
25-
: BuildStatsBTreeIndex(subset, stats, rowCountResolution, dataSizeResolution, env, yieldHandler);
25+
: BuildStatsBTreeIndex(subset, stats, histogramBucketsCount, env, yieldHandler);
2626
}
2727

2828
void GetPartOwners(const TSubset& subset, THashSet<ui64>& partOwners) {

ydb/core/tablet_flat/flat_stat_table.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ class TKeyAccessSample {
190190

191191
using TBuildStatsYieldHandler = std::function<void()>;
192192

193-
bool BuildStats(const TSubset& subset, TStats& stats, ui64 rowCountResolution, ui64 dataSizeResolution, IPages* env, TBuildStatsYieldHandler yieldHandler);
193+
bool BuildStats(const TSubset& subset, TStats& stats, ui64 rowCountResolution, ui64 dataSizeResolution, ui32 histogramBucketsCount, IPages* env, TBuildStatsYieldHandler yieldHandler);
194194
void GetPartOwners(const TSubset& subset, THashSet<ui64>& partOwners);
195195

196196
}}

ydb/core/tablet_flat/flat_stat_table_btree_index.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ bool AddDataSize(const TPartView& part, TStats& stats, IPages* env, TBuildStatsY
196196

197197
}
198198

199-
inline bool BuildStatsBTreeIndex(const TSubset& subset, TStats& stats, ui64 rowCountResolution, ui64 dataSizeResolution, IPages* env, TBuildStatsYieldHandler yieldHandler) {
199+
inline bool BuildStatsBTreeIndex(const TSubset& subset, TStats& stats, ui32 histogramBucketsCount, IPages* env, TBuildStatsYieldHandler yieldHandler) {
200200
stats.Clear();
201201

202202
bool ready = true;
@@ -209,7 +209,7 @@ inline bool BuildStatsBTreeIndex(const TSubset& subset, TStats& stats, ui64 rowC
209209
return false;
210210
}
211211

212-
ready &= BuildStatsHistogramsBTreeIndex(subset, stats, rowCountResolution, dataSizeResolution, env, yieldHandler);
212+
ready &= BuildStatsHistogramsBTreeIndex(subset, stats, histogramBucketsCount, env, yieldHandler);
213213

214214
return ready;
215215
}

ydb/core/tablet_flat/flat_stat_table_btree_index_histogram.h

+13-7
Original file line numberDiff line numberDiff line change
@@ -127,17 +127,22 @@ class TTableHistogramBuilderBtreeIndex {
127127
};
128128

129129
public:
130-
TTableHistogramBuilderBtreeIndex(const TSubset& subset, IPages* env, TBuildStatsYieldHandler yieldHandler)
130+
TTableHistogramBuilderBtreeIndex(const TSubset& subset, IPages* env, ui32 histogramBucketsCount, TBuildStatsYieldHandler yieldHandler)
131131
: Subset(subset)
132132
, KeyDefaults(*Subset.Scheme->Keys)
133133
, Env(env)
134+
, HistogramBucketsCount(histogramBucketsCount)
134135
, YieldHandler(yieldHandler)
135136
{
136137
}
137138

138139
template <typename TGetSize>
139-
bool Build(THistogram& histogram, ui64 resolution, ui64 statTotalSize) {
140-
Resolution = resolution;
140+
bool Build(THistogram& histogram, ui64 statTotalSize) {
141+
if (!HistogramBucketsCount) {
142+
return true;
143+
}
144+
145+
Resolution = statTotalSize / HistogramBucketsCount;
141146
StatTotalSize = statTotalSize;
142147

143148
bool ready = true;
@@ -497,6 +502,7 @@ class TTableHistogramBuilderBtreeIndex {
497502
const TSubset& Subset;
498503
const TKeyCellDefaults& KeyDefaults;
499504
IPages* const Env;
505+
ui32 HistogramBucketsCount;
500506
TBuildStatsYieldHandler YieldHandler;
501507
ui64 Resolution, StatTotalSize;
502508
TDeque<TBtreeIndexNode> LoadedBTreeNodes; // keep nodes to use TCellsIterable key refs
@@ -505,13 +511,13 @@ class TTableHistogramBuilderBtreeIndex {
505511

506512
}
507513

508-
inline bool BuildStatsHistogramsBTreeIndex(const TSubset& subset, TStats& stats, ui64 rowCountResolution, ui64 dataSizeResolution, IPages* env, TBuildStatsYieldHandler yieldHandler) {
514+
inline bool BuildStatsHistogramsBTreeIndex(const TSubset& subset, TStats& stats, ui32 histogramBucketsCount, IPages* env, TBuildStatsYieldHandler yieldHandler) {
509515
bool ready = true;
510516

511-
TTableHistogramBuilderBtreeIndex builder(subset, env, yieldHandler);
517+
TTableHistogramBuilderBtreeIndex builder(subset, env, histogramBucketsCount, yieldHandler);
512518

513-
ready &= builder.Build<TTableHistogramBuilderBtreeIndex::TGetRowCount>(stats.RowCountHistogram, rowCountResolution, stats.RowCount);
514-
ready &= builder.Build<TTableHistogramBuilderBtreeIndex::TGetDataSize>(stats.DataSizeHistogram, dataSizeResolution, stats.DataSize.Size);
519+
ready &= builder.Build<TTableHistogramBuilderBtreeIndex::TGetRowCount>(stats.RowCountHistogram, stats.RowCount);
520+
ready &= builder.Build<TTableHistogramBuilderBtreeIndex::TGetDataSize>(stats.DataSizeHistogram, stats.DataSize.Size);
515521

516522
return ready;
517523
}

ydb/core/tablet_flat/ut/ut_stat.cpp

+30-7
Original file line numberDiff line numberDiff line change
@@ -92,13 +92,13 @@ namespace {
9292
UNIT_ASSERT_VALUES_EQUAL(stats.IndexSize.Size, expectedIndex);
9393
}
9494

95-
void CheckBTreeIndex(const TSubset& subset, ui64 expectedRows, ui64 expectedData, ui64 expectedIndex, ui64 rowCountResolution = 531, ui64 dataSizeResolution = 53105) {
95+
void CheckBTreeIndex(const TSubset& subset, ui64 expectedRows, ui64 expectedData, ui64 expectedIndex, ui32 histogramBucketsCount = 10) {
9696
TStats stats;
9797
TTouchEnv env;
9898

9999
const ui32 attempts = 25;
100100
for (ui32 attempt : xrange(attempts)) {
101-
if (NTable::BuildStatsBTreeIndex(subset, stats, rowCountResolution, dataSizeResolution, &env, [](){})) {
101+
if (NTable::BuildStatsBTreeIndex(subset, stats, histogramBucketsCount, &env, [](){})) {
102102
break;
103103
}
104104
UNIT_ASSERT_C(attempt + 1 < attempts, "Too many attempts");
@@ -554,7 +554,7 @@ Y_UNIT_TEST_SUITE(BuildStatsHistogram) {
554554
}
555555
}
556556

557-
void Check(const TSubset& subset, TMode mode, ui32 buckets = 10, bool verifyPercents = true) {
557+
void Check(const TSubset& subset, TMode mode, ui32 histogramBucketsCount = 10, bool verifyPercents = true) {
558558
if (mode == 0) {
559559
Dump(subset);
560560
}
@@ -567,21 +567,21 @@ Y_UNIT_TEST_SUITE(BuildStatsHistogram) {
567567
CalcDataBefore(subset, TSerializedCellVec(emptyKey), totalBytes, totalRows);
568568
}
569569

570-
ui64 rowCountResolution = totalRows / buckets;
571-
ui64 dataSizeResolution = totalBytes / buckets;
570+
ui64 rowCountResolution = totalRows / histogramBucketsCount;
571+
ui64 dataSizeResolution = totalBytes / histogramBucketsCount;
572572

573573
TTouchEnv env;
574574
// env.Faulty = false; // uncomment for debug
575575
TStats stats;
576576
auto buildStats = [&]() {
577577
if (mode == BTreeIndex) {
578-
return NTable::BuildStatsBTreeIndex(subset, stats, rowCountResolution, dataSizeResolution, &env, [](){});
578+
return NTable::BuildStatsBTreeIndex(subset, stats, histogramBucketsCount, &env, [](){});
579579
} else {
580580
return NTable::BuildStatsMixedIndex(subset, stats, rowCountResolution, dataSizeResolution, &env, [](){});
581581
}
582582
};
583583

584-
const ui32 attempts = 25;
584+
const ui32 attempts = 35;
585585
for (ui32 attempt : xrange(attempts)) {
586586
if (buildStats()) {
587587
break;
@@ -1010,6 +1010,29 @@ Y_UNIT_TEST_SUITE(BuildStatsHistogram) {
10101010
Check(*subset, mode);
10111011
}
10121012
}
1013+
1014+
// This test uses the same data as benchmark::TPartFixture/BuildStats/*, but it can be debugged and prints the resulting histograms
1015+
Y_UNIT_TEST(Benchmark)
1016+
{
1017+
const ui32 partsCount = 4;
1018+
const bool groups = false;
1019+
const bool history = false;
1020+
ui64 rowsCount = history ? 300000 : 1000000;
1021+
1022+
rowsCount /= 100; // to be faster
1023+
1024+
TAutoPtr<TMass> mass = new NTest::TMass(new NTest::TModelStd(groups), rowsCount);
1025+
1026+
for (auto mode : {BTreeIndex, FlatIndex, MixedIndex}) {
1027+
NPage::TConf conf;
1028+
conf.Groups.resize(mass->Model->Scheme->Families.size());
1029+
conf.WriteBTreeIndex = (mode == FlatIndex ? false : true);
1030+
1031+
TAutoPtr<TSubset> subset = TMake(*mass, conf).Mixed(0, partsCount, TMixerRnd(partsCount), history ? 0.7 : 0);
1032+
1033+
Check(*subset, mode, 10, false);
1034+
}
1035+
}
10131036
}
10141037

10151038
}

ydb/core/tx/datashard/datashard.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ bool gAllowLogBatchingDefaultValue = true;
3737
TDuration gDbStatsReportInterval = TDuration::Seconds(10);
3838
ui64 gDbStatsDataSizeResolution = 10*1024*1024;
3939
ui64 gDbStatsRowCountResolution = 100000;
40+
ui32 gDbStatsHistogramBucketsCount = 10;
4041

4142
// The first byte is 0x01 so it would fail to parse as an internal tablet protobuf
4243
TStringBuf SnapshotTransferReadSetMagic("\x01SRS", 4);

ydb/core/tx/datashard/datashard.h

+2-4
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ namespace NDataShard {
151151
extern TDuration gDbStatsReportInterval;
152152
extern ui64 gDbStatsDataSizeResolution;
153153
extern ui64 gDbStatsRowCountResolution;
154+
extern ui32 gDbStatsHistogramBucketsCount;
154155

155156
// This SeqNo is used to discard outdated schema Tx requests on datashards.
156157
// In case of tablet restart on network disconnects SS can resend same Propose for the same schema Tx.
@@ -829,11 +830,8 @@ struct TEvDataShard {
829830
NKikimrTxDataShard::TEvGetTableStats,
830831
TEvDataShard::EvGetTableStats> {
831832
TEvGetTableStats() = default;
832-
explicit TEvGetTableStats(ui64 tableId, ui64 dataSizeResolution = 0, ui64 rowCountResolution = 0, bool collectKeySample = false) {
833+
explicit TEvGetTableStats(ui64 tableId) {
833834
Record.SetTableId(tableId);
834-
Record.SetDataSizeResolution(dataSizeResolution);
835-
Record.SetRowCountResolution(rowCountResolution);
836-
Record.SetCollectKeySample(collectKeySample);
837835
}
838836
};
839837

0 commit comments

Comments
 (0)