Skip to content

Commit 8c8a1b1

Browse files
committed
add different metrics to graph service
1 parent 37ab7bf commit 8c8a1b1

File tree

11 files changed

+169
-11
lines changed

11 files changed

+169
-11
lines changed

ydb/core/graph/api/events.h

+4
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@ struct TEvGraph {
2121
TEvSendMetrics() = default;
2222

2323
TEvSendMetrics(const TString& name, double value) {
24+
AddMetric(name, value);
25+
}
26+
27+
void AddMetric(const TString& name, double value) {
2428
NKikimrGraph::TMetric* metric = Record.AddMetrics();
2529
metric->SetName(name);
2630
metric->SetValue(value);

ydb/core/graph/api/service.h

+2
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,7 @@ inline TActorId MakeGraphServiceId(ui32 node = 0) {
1414

1515
IActor* CreateGraphService(const TString& database);
1616

17+
double GetTimingForPercentile(double percentile, const TVector<ui64>& values, const TVector<ui64>& /*upper*/bounds, ui64 total);
18+
1719
} // NGraph
1820
} // NKikimr

ydb/core/graph/service/service_impl.cpp

+28
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,7 @@ class TGraphService : public TActor<TGraphService> {
230230
hFunc(TEvGraph::TEvMetricsResult, Handle);
231231
hFunc(TEvTxProxySchemeCache::TEvNavigateKeySetResult, Handle);
232232
hFunc(TEvTabletPipe::TEvClientConnected, Handle);
233+
hFunc(TEvTabletPipe::TEvClientDestroyed, Handle);
233234
cFunc(TEvents::TSystem::Wakeup, HandleTimeout);
234235
}
235236
}
@@ -247,5 +248,32 @@ IActor* CreateGraphService(const TString& database) {
247248
return new TGraphService(database);
248249
}
249250

251+
double GetTimingForPercentile(double percentile, const TVector<ui64>& values, const TVector<ui64>& /*upper*/bounds, ui64 total) {
252+
ui64 ppMark = total * percentile / 100;
253+
ui64 accm = 0;
254+
ui32 n = 0;
255+
while (n < bounds.size() && accm < ppMark) {
256+
if (accm + values[n] >= ppMark) {
257+
ui64 lowerBound = 0;
258+
if (n > 0) {
259+
lowerBound = bounds[n - 1];
260+
}
261+
ui64 upperBound = bounds[n];
262+
if (upperBound == std::numeric_limits<ui64>::max()) {
263+
return lowerBound; // workaround for INF bucket
264+
}
265+
ui64 currentValue = values[n];
266+
ui64 ppValue = ppMark - accm;
267+
if (currentValue == 0) {
268+
return NAN;
269+
}
270+
return (static_cast<double>(ppValue) / currentValue) * (upperBound - lowerBound) + lowerBound;
271+
}
272+
accm += values[n];
273+
n++;
274+
}
275+
return NAN;
276+
}
277+
250278
} // NGraph
251279
} // NKikimr

ydb/core/graph/shard/backends.cpp

+5-2
Original file line numberDiff line numberDiff line change
@@ -237,18 +237,21 @@ bool TLocalBackend::ClearData(NTabletFlatExecutor::TTransactionContext& txc, TIn
237237
if (!rowset.IsReady()) {
238238
return false;
239239
}
240+
ui64 prevTimestamp = 0;
240241
while (!rowset.EndOfSet()) {
241242
ui64 timestamp = rowset.GetValue<Schema::MetricsValues::Timestamp>();
242243
ui64 id = rowset.GetValue<Schema::MetricsValues::Id>();
243244
db.Table<Schema::MetricsValues>().Key(timestamp, id).Delete();
244245
newStartTimestamp = TInstant::Seconds(timestamp);
245-
if (++rows >= MAX_ROWS_TO_DELETE) {
246-
break;
246+
if (timestamp != prevTimestamp && ++rows >= MAX_ROWS_TO_DELETE) { // we count as a logical row every unique timestamp
247+
break; // so for database it will be MAX_ROWS * NUM_OF_METRICS rows
247248
}
249+
prevTimestamp = timestamp;
248250
if (!rowset.Next()) {
249251
return false;
250252
}
251253
}
254+
BLOG_D("Cleared " << rows << " logical rows");
252255
return true;
253256
}
254257

ydb/core/graph/shard/shard_impl.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ void TGraphShard::Handle(TEvGraph::TEvSendMetrics::TPtr& ev) {
9191
MetricsData.Timestamp = now;
9292
MetricsData.Values.clear();
9393
}
94-
if ((now - StartTimestamp) > DURATION_CLEAR_TRIGGER && (now - ClearTimestamp) < DURATION_CLEAR_PERIOD) {
94+
if ((now - StartTimestamp) > DURATION_CLEAR_TRIGGER && (now - ClearTimestamp) > DURATION_CLEAR_PERIOD) {
9595
ClearTimestamp = now;
9696
BLOG_TRACE("Executing TxClearData");
9797
ExecuteTxClearData();

ydb/core/graph/shard/shard_impl.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ class TGraphShard : public TActor<TGraphShard>, public NTabletFlatExecutor::TTab
4343
STATEFN(StateWork);
4444

4545
// how often we could issue a clear operation
46-
static constexpr TDuration DURATION_CLEAR_PERIOD = TDuration::Minutes(10);
46+
static constexpr TDuration DURATION_CLEAR_PERIOD = TDuration::Minutes(1);
4747
// after what size of metrics data we issue a clear operation
4848
static constexpr TDuration DURATION_CLEAR_TRIGGER = TDuration::Hours(25);
4949
// the maximum size of metrics data to keep

ydb/core/graph/shard/tx_clear_data.cpp

-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#include "shard_impl.h"
22
#include "log.h"
3-
#include "schema.h"
43

54
namespace NKikimr {
65
namespace NGraph {

ydb/core/graph/shard/tx_monitoring.cpp

+34-4
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,24 @@ class TTxMonitoring : public TTransactionBase<TGraphShard> {
2121
return true;
2222
}
2323

24+
static TString DumpMetricsIndex(const std::unordered_map<TString, ui64>& metricsIndex) {
25+
TStringBuilder str;
26+
str << metricsIndex.size();
27+
if (!metricsIndex.empty()) {
28+
str << " (";
29+
bool wasItem = false;
30+
for (const auto& [name, idx] : metricsIndex) {
31+
if (wasItem) {
32+
str << ", ";
33+
}
34+
str << name;
35+
wasItem = true;
36+
}
37+
str << ")";
38+
}
39+
return str;
40+
}
41+
2442
void Complete(const TActorContext& ctx) override {
2543
BLOG_D("TTxMonitoring::Complete");
2644
TStringBuilder html;
@@ -47,12 +65,24 @@ class TTxMonitoring : public TTransactionBase<TGraphShard> {
4765
}
4866
html << "</td></tr>";
4967

50-
html << "<tr><td>Memory.MetricsSize</td><td>" << Self->MemoryBackend.MetricsIndex.size() << "</td></tr>";
68+
html << "<tr><td>Memory.MetricsSize</td><td>" << DumpMetricsIndex(Self->MemoryBackend.MetricsIndex) << "</td></tr>";
5169
html << "<tr><td>Memory.RecordsSize</td><td>" << Self->MemoryBackend.MetricsValues.size() << "</td></tr>";
5270

53-
html << "<tr><td>Local.MetricsSize</td><td>" << Self->LocalBackend.MetricsIndex.size() << "</td></tr>";
54-
html << "<tr><td>StartTimestamp</td><td>" << Self->StartTimestamp << "</td></tr>";
55-
html << "<tr><td>ClearTimestamp</td><td>" << Self->ClearTimestamp << "</td></tr>";
71+
html << "<tr><td>Local.MetricsSize</td><td>" << DumpMetricsIndex(Self->LocalBackend.MetricsIndex) << "</td></tr>";
72+
html << "<tr><td>StartTimestamp</td><td>" << Self->StartTimestamp.ToIsoStringLocalUpToSeconds() << "</td></tr>";
73+
html << "<tr><td>ClearTimestamp</td><td>" << Self->ClearTimestamp.ToIsoStringLocalUpToSeconds() << "</td></tr>";
74+
html << "<tr><td>CurrentTimestamp</td><td>" << Self->MetricsData.Timestamp.ToIsoStringLocalUpToSeconds() << "</td></tr>";
75+
76+
html << "<tr><td style='vertical-align:top'>CurrentMetricsData</td><td>";
77+
bool wasLine = false;
78+
for (const auto& [name, value] : Self->MetricsData.Values) {
79+
if (wasLine) {
80+
html << "<br>";
81+
}
82+
html << name << "=" << value;
83+
wasLine = true;
84+
}
85+
html << "</td></tr>";
5686

5787
html << "</table>";
5888
html << "</html>";

ydb/core/graph/shard/ut/shard_ut.cpp

+24
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <ydb/core/tx/schemeshard/ut_helpers/helpers.h>
66
#include <util/stream/output.h>
77
#include <ydb/core/graph/shard/backends.h>
8+
#include <ydb/core/graph/api/service.h>
89

910
#ifdef NDEBUG
1011
#define Ctest Cnull
@@ -257,6 +258,29 @@ Y_UNIT_TEST_SUITE(GraphShard) {
257258
}
258259
}
259260

261+
Y_UNIT_TEST(CheckHistogramToPercentileConversions) {
262+
TVector<ui64> bounds = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, std::numeric_limits<ui64>::max()};
263+
TVector<ui64> values = {10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 0};
264+
ui64 total = std::accumulate(values.begin(), values.end(), 0);
265+
UNIT_ASSERT(total == 100);
266+
auto p50 = NGraph::GetTimingForPercentile(50, values, bounds, total);
267+
Ctest << "p50=" << p50 << Endl;
268+
UNIT_ASSERT(!isnan(p50));
269+
UNIT_ASSERT(abs(p50 - 32) < 0.01); // 32ms
270+
auto p75 = NGraph::GetTimingForPercentile(75, values, bounds, total);
271+
Ctest << "p75=" << p75 << Endl;
272+
UNIT_ASSERT(!isnan(p75));
273+
UNIT_ASSERT(abs(p75 - 192) < 0.01); // 192ms
274+
auto p90 = NGraph::GetTimingForPercentile(90, values, bounds, total);
275+
Ctest << "p90=" << p90 << Endl;
276+
UNIT_ASSERT(!isnan(p90));
277+
UNIT_ASSERT(abs(p90 - 512) < 0.01); // 512ms
278+
auto p99 = NGraph::GetTimingForPercentile(99, values, bounds, total);
279+
Ctest << "p99=" << p99 << Endl;
280+
UNIT_ASSERT(!isnan(p99));
281+
UNIT_ASSERT(abs(p99 - 972.8) < 0.01); // 972.8ms
282+
}
283+
260284
TTenantTestConfig GetTenantTestConfig() {
261285
return {
262286
.Domains = {

ydb/core/sys_view/service/ext_counters.cpp

+69-2
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
#include <ydb/core/base/appdata.h>
44
#include <ydb/core/base/counters.h>
5+
#include <ydb/core/graph/api/events.h>
6+
#include <ydb/core/graph/api/service.h>
57
#include <ydb/library/services/services.pb.h>
68
#include <ydb/library/actors/core/actor_bootstrapped.h>
79

@@ -13,18 +15,26 @@ class TExtCountersUpdaterActor
1315
: public TActorBootstrapped<TExtCountersUpdaterActor>
1416
{
1517
using TCounterPtr = ::NMonitoring::TDynamicCounters::TCounterPtr;
18+
using THistogramPtr = ::NMonitoring::THistogramPtr;
19+
using THistogramSnapshotPtr = ::NMonitoring::IHistogramSnapshotPtr;
1620

1721
const TExtCountersConfig Config;
1822

1923
TCounterPtr MemoryUsedBytes;
2024
TCounterPtr MemoryLimitBytes;
25+
TCounterPtr StorageUsedBytes;
2126
TVector<TCounterPtr> CpuUsedCorePercents;
2227
TVector<TCounterPtr> CpuLimitCorePercents;
28+
THistogramPtr ExecuteLatencyMs;
2329

2430
TCounterPtr AnonRssSize;
2531
TCounterPtr CGroupMemLimit;
2632
TVector<TCounterPtr> PoolElapsedMicrosec;
2733
TVector<TCounterPtr> PoolCurrentThreadCount;
34+
TVector<ui64> PoolElapsedMicrosecPrevValue;
35+
TVector<ui64> ExecuteLatencyMsValues;
36+
TVector<ui64> ExecuteLatencyMsPrevValues;
37+
TVector<ui64> ExecuteLatencyMsBounds;
2838

2939
public:
3040
static constexpr NKikimrServices::TActivity::EType ActorActivityType() {
@@ -42,6 +52,8 @@ class TExtCountersUpdaterActor
4252
"resources.memory.used_bytes", false);
4353
MemoryLimitBytes = ydbGroup->GetNamedCounter("name",
4454
"resources.memory.limit_bytes", false);
55+
StorageUsedBytes = ydbGroup->GetNamedCounter("name",
56+
"resources.storage.used_bytes", false);
4557

4658
auto poolCount = Config.Pools.size();
4759
CpuUsedCorePercents.resize(poolCount);
@@ -55,6 +67,8 @@ class TExtCountersUpdaterActor
5567
"resources.cpu.limit_core_percents", false);
5668
}
5769

70+
ExecuteLatencyMs = ydbGroup->FindNamedHistogram("name", "table.query.execution.latency_milliseconds");
71+
5872
Schedule(TDuration::Seconds(1), new TEvents::TEvWakeup);
5973
Become(&TThis::StateWork);
6074
}
@@ -69,29 +83,41 @@ class TExtCountersUpdaterActor
6983

7084
PoolElapsedMicrosec.resize(Config.Pools.size());
7185
PoolCurrentThreadCount.resize(Config.Pools.size());
86+
PoolElapsedMicrosecPrevValue.resize(Config.Pools.size());
7287
for (size_t i = 0; i < Config.Pools.size(); ++i) {
7388
auto poolGroup = utilsGroup->FindSubgroup("execpool", Config.Pools[i].Name);
7489
if (poolGroup) {
7590
PoolElapsedMicrosec[i] = poolGroup->FindCounter("ElapsedMicrosec");
7691
PoolCurrentThreadCount[i] = poolGroup->FindCounter("CurrentThreadCount");
92+
if (PoolElapsedMicrosec[i]) {
93+
PoolElapsedMicrosecPrevValue[i] = PoolElapsedMicrosec[i]->Val();
94+
}
7795
}
7896
}
7997
}
8098
}
8199

82100
void Transform() {
83101
Initialize();
84-
102+
auto metrics(MakeHolder<NGraph::TEvGraph::TEvSendMetrics>());
85103
if (AnonRssSize) {
86104
MemoryUsedBytes->Set(AnonRssSize->Val());
105+
metrics->AddMetric("resources.memory.used_bytes", AnonRssSize->Val());
87106
}
88107
if (CGroupMemLimit) {
89108
MemoryLimitBytes->Set(CGroupMemLimit->Val());
90109
}
110+
metrics->AddMetric("resources.storage.used_bytes", StorageUsedBytes->Val());
111+
double cpuUsage = 0;
91112
for (size_t i = 0; i < Config.Pools.size(); ++i) {
92113
if (PoolElapsedMicrosec[i]) {
93-
double usedCore = PoolElapsedMicrosec[i]->Val() / 10000.;
114+
auto elapsedMs = PoolElapsedMicrosec[i]->Val();
115+
double usedCore = elapsedMs / 10000.;
94116
CpuUsedCorePercents[i]->Set(usedCore);
117+
if (PoolElapsedMicrosecPrevValue[i] != 0) {
118+
cpuUsage += (elapsedMs - PoolElapsedMicrosecPrevValue[i]) / 1000000.;
119+
}
120+
PoolElapsedMicrosecPrevValue[i] = elapsedMs;
95121
}
96122
if (PoolCurrentThreadCount[i] && PoolCurrentThreadCount[i]->Val()) {
97123
double limitCore = PoolCurrentThreadCount[i]->Val() * 100;
@@ -101,6 +127,47 @@ class TExtCountersUpdaterActor
101127
CpuLimitCorePercents[i]->Set(limitCore);
102128
}
103129
}
130+
metrics->AddMetric("resources.cpu.usage", cpuUsage);
131+
if (ExecuteLatencyMs) {
132+
THistogramSnapshotPtr snapshot = ExecuteLatencyMs->Snapshot();
133+
ui32 count = snapshot->Count();
134+
if (ExecuteLatencyMsValues.empty()) {
135+
ExecuteLatencyMsValues.resize(count);
136+
ExecuteLatencyMsPrevValues.resize(count);
137+
ExecuteLatencyMsBounds.resize(count);
138+
}
139+
ui64 total = 0;
140+
for (ui32 n = 0; n < count; ++n) {
141+
ui64 value = snapshot->Value(n);;
142+
ui64 diff = value - ExecuteLatencyMsPrevValues[n];
143+
total += diff;
144+
ExecuteLatencyMsValues[n] = diff;
145+
ExecuteLatencyMsPrevValues[n] = value;
146+
if (ExecuteLatencyMsBounds[n] == 0) {
147+
ExecuteLatencyMsBounds[n] = snapshot->UpperBound(n);
148+
}
149+
}
150+
metrics->AddMetric("queries.requests", total);
151+
if (total != 0) {
152+
double p50 = NGraph::GetTimingForPercentile(50, ExecuteLatencyMsValues, ExecuteLatencyMsBounds, total);
153+
if (!isnan(p50)) {
154+
metrics->AddMetric("queries.latencies.p50", p50);
155+
}
156+
double p75 = NGraph::GetTimingForPercentile(75, ExecuteLatencyMsValues, ExecuteLatencyMsBounds, total);
157+
if (!isnan(p75)) {
158+
metrics->AddMetric("queries.latencies.p75", p75);
159+
}
160+
double p90 = NGraph::GetTimingForPercentile(90, ExecuteLatencyMsValues, ExecuteLatencyMsBounds, total);
161+
if (!isnan(p90)) {
162+
metrics->AddMetric("queries.latencies.p90", p90);
163+
}
164+
double p99 = NGraph::GetTimingForPercentile(99, ExecuteLatencyMsValues, ExecuteLatencyMsBounds, total);
165+
if (!isnan(p99)) {
166+
metrics->AddMetric("queries.latencies.p99", p99);
167+
}
168+
}
169+
}
170+
Send(NGraph::MakeGraphServiceId(), metrics.Release());
104171
}
105172

106173
void HandleWakeup() {

ydb/core/sys_view/service/ya.make

+1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ PEERDIR(
1616
ydb/library/actors/core
1717
ydb/core/base
1818
ydb/core/protos
19+
ydb/core/graph/api
1920
ydb/library/aclib/protos
2021
)
2122

0 commit comments

Comments
 (0)