Skip to content

Commit 50a24e3

Browse files
authored
Improve BlobDepot assimilator metrics (#4030)
1 parent 2695540 commit 50a24e3

File tree

4 files changed

+157
-47
lines changed

4 files changed

+157
-47
lines changed

ydb/core/blob_depot/assimilator.cpp

Lines changed: 112 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ namespace NKikimr::NBlobDepot {
9494

9595
Become(&TThis::StateFunc);
9696
Action();
97+
UpdateBytesCopiedQ();
9798
}
9899

99100
void TAssimilator::PassAway() {
@@ -116,8 +117,10 @@ namespace NKikimr::NBlobDepot {
116117
hFunc(TEvTabletPipe::TEvClientDestroyed, Handle);
117118
hFunc(TEvBlobStorage::TEvControllerGroupDecommittedResponse, Handle);
118119
cFunc(TEvPrivate::EvResume, Action);
120+
cFunc(TEvPrivate::EvResumeScanDataForPlanning, HandleResumeScanDataForPlanning);
119121
cFunc(TEvPrivate::EvResumeScanDataForCopying, HandleResumeScanDataForCopying);
120122
fFunc(TEvPrivate::EvTxComplete, HandleTxComplete);
123+
cFunc(TEvPrivate::EvUpdateBytesCopiedQ, UpdateBytesCopiedQ);
121124
cFunc(TEvents::TSystem::Poison, PassAway);
122125

123126
default:
@@ -134,7 +137,11 @@ namespace NKikimr::NBlobDepot {
134137
if (Self->DecommitState < EDecommitState::BlobsFinished) {
135138
SendAssimilateRequest();
136139
} else if (Self->DecommitState < EDecommitState::BlobsCopied) {
137-
ScanDataForCopying();
140+
if (PlanningComplete) {
141+
ScanDataForCopying();
142+
} else {
143+
ScanDataForPlanning();
144+
}
138145
} else if (Self->DecommitState == EDecommitState::BlobsCopied) {
139146
Y_ABORT_UNLESS(!PipeId);
140147
CreatePipe();
@@ -283,7 +290,61 @@ namespace NKikimr::NBlobDepot {
283290
}
284291
}
285292

293+
void TAssimilator::ScanDataForPlanning() {
294+
if (ResumeScanDataForPlanningInFlight) {
295+
return;
296+
}
297+
298+
THPTimer timer;
299+
ui32 numItems = 0;
300+
bool timeout = false;
301+
302+
if (!LastPlanScannedKey) {
303+
++Self->Assimilator.CopyIteration;
304+
Self->Assimilator.BytesToCopy = 0;
305+
}
306+
307+
TData::TScanRange range{
308+
LastPlanScannedKey ? TData::TKey(*LastPlanScannedKey) : TData::TKey::Min(),
309+
TData::TKey::Max(),
310+
};
311+
Self->Data->ScanRange(range, nullptr, nullptr, [&](const TData::TKey& key, const TData::TValue& value) {
312+
if (++numItems == 1000) {
313+
numItems = 0;
314+
if (TDuration::Seconds(timer.Passed()) >= TDuration::MilliSeconds(1)) {
315+
timeout = true;
316+
return false;
317+
}
318+
}
319+
if (value.GoingToAssimilate) {
320+
Self->Assimilator.BytesToCopy += key.GetBlobId().BlobSize();
321+
}
322+
LastPlanScannedKey.emplace(key.GetBlobId());
323+
return true;
324+
});
325+
326+
if (timeout) {
327+
ResumeScanDataForPlanningInFlight = true;
328+
TActivationContext::Send(new IEventHandle(TEvPrivate::EvResumeScanDataForPlanning, 0, SelfId(), {}, nullptr, 0));
329+
return;
330+
}
331+
332+
ActionInProgress = false;
333+
PlanningComplete = true;
334+
Action();
335+
}
336+
337+
// Handler for TEvPrivate::EvResumeScanDataForPlanning: clears the in-flight
// marker and runs the next slice of the planning scan.
void TAssimilator::HandleResumeScanDataForPlanning() {
    // ScanDataForPlanning() raises the marker right before sending the event,
    // so receiving the event with the marker cleared is a logic error.
    Y_ABORT_UNLESS(ResumeScanDataForPlanningInFlight);

    // Must be cleared before the call: ScanDataForPlanning() returns early while it is set.
    ResumeScanDataForPlanningInFlight = false;
    ScanDataForPlanning();
}
342+
286343
void TAssimilator::ScanDataForCopying() {
344+
if (ResumeScanDataForCopyingInFlight) {
345+
return;
346+
}
347+
287348
STLOG(PRI_DEBUG, BLOB_DEPOT, BDT54, "TAssimilator::ScanDataForCopying", (Id, Self->GetLogId()),
288349
(LastScannedKey, LastScannedKey), (NumGetsUnprocessed, GetIdToUnprocessedPuts.size()));
289350

@@ -324,11 +385,8 @@ namespace NKikimr::NBlobDepot {
324385
(EntriesToProcess, EntriesToProcess), (Timeout, timeout), (NumGetsUnprocessed, GetIdToUnprocessedPuts.size()));
325386

326387
if (timeout) { // timeout hit, reschedule work
327-
if (!ResumeScanDataForCopyingInFlight) {
328-
TActivationContext::Send(new IEventHandle(TEvPrivate::EvResumeScanDataForCopying, 0, SelfId(), {}, nullptr, 0));
329-
ResumeScanDataForCopyingInFlight = true;
330-
}
331-
break;
388+
TActivationContext::Send(new IEventHandle(TEvPrivate::EvResumeScanDataForCopying, 0, SelfId(), {}, nullptr, 0));
389+
ResumeScanDataForCopyingInFlight = true;
332390
} else if (!ScanQ.empty()) {
333391
using TQuery = TEvBlobStorage::TEvGet::TQuery;
334392
const ui32 sz = ScanQ.size();
@@ -345,15 +403,18 @@ namespace NKikimr::NBlobDepot {
345403
GetIdToUnprocessedPuts.try_emplace(getId);
346404
ScanQ.clear();
347405
TotalSize = 0;
348-
} else if (!GetIdToUnprocessedPuts.empty()) { // there are some unprocessed get queries, still have to wait
349-
break;
406+
continue;
407+
} else if (!GetIdToUnprocessedPuts.empty()) {
408+
// there are some unprocessed get queries, still have to wait
350409
} else if (!EntriesToProcess) { // we have finished scanning the whole table without any entries, copying is done
351410
OnCopyDone();
352-
break;
353411
} else { // we have finished scanning, but we have replicated some data, restart scanning to ensure that nothing left
354412
LastScannedKey.reset();
355-
EntriesToProcess = false;
413+
LastPlanScannedKey.reset();
414+
EntriesToProcess = PlanningComplete = ActionInProgress = false;
415+
Action();
356416
}
417+
break;
357418
}
358419
}
359420

@@ -365,7 +426,7 @@ namespace NKikimr::NBlobDepot {
365426

366427
void TAssimilator::Handle(TEvBlobStorage::TEvGetResult::TPtr ev) {
367428
auto& msg = *ev->Get();
368-
(msg.Status == NKikimrProto::OK ? Self->AssimilatorLatestOkGet : Self->AssimilatorLatestOkPut) = TInstant::Now();
429+
(msg.Status == NKikimrProto::OK ? Self->Assimilator.LatestOkGet : Self->Assimilator.LatestErrorGet) = TInstant::Now();
369430
const auto it = GetIdToUnprocessedPuts.find(ev->Cookie);
370431
Y_ABORT_UNLESS(it != GetIdToUnprocessedPuts.end());
371432
ui32 getBytes = 0;
@@ -389,26 +450,25 @@ namespace NKikimr::NBlobDepot {
389450
++it->second;
390451
}
391452
getBytes += resp.Id.BlobSize();
392-
++Self->AssimilatorBlobsReadOk;
453+
++Self->Assimilator.BlobsReadOk;
393454
} else if (resp.Status == NKikimrProto::NODATA) {
394455
Self->Data->ExecuteTxCommitAssimilatedBlob(NKikimrProto::NODATA, TBlobSeqId(), TData::TKey(resp.Id),
395456
TEvPrivate::EvTxComplete, SelfId(), it->first);
396457
++it->second;
397-
++Self->AssimilatorBlobsReadNoData;
458+
++Self->Assimilator.BlobsReadNoData;
459+
Self->Assimilator.BytesToCopy -= resp.Id.BlobSize();
398460
} else {
399-
++Self->AssimilatorBlobsReadError;
461+
++Self->Assimilator.BlobsReadError;
400462
continue;
401463
}
402-
Self->AssimilatorLastReadBlobId = resp.Id;
464+
Self->Assimilator.LastReadBlobId = resp.Id;
403465
}
404466
if (getBytes) {
405467
Self->TabletCounters->Cumulative()[NKikimrBlobDepot::COUNTER_DECOMMIT_GET_BYTES] += getBytes;
406468
}
407469
if (!it->second) {
408470
GetIdToUnprocessedPuts.erase(it);
409-
if (!ResumeScanDataForCopyingInFlight) {
410-
ScanDataForCopying();
411-
}
471+
ScanDataForCopying();
412472
}
413473
}
414474

@@ -417,20 +477,20 @@ namespace NKikimr::NBlobDepot {
417477
Y_ABORT_UNLESS(it != GetIdToUnprocessedPuts.end());
418478
if (!--it->second) {
419479
GetIdToUnprocessedPuts.erase(it);
420-
if (!ResumeScanDataForCopyingInFlight) {
421-
ScanDataForCopying();
422-
}
480+
ScanDataForCopying();
423481
}
424482
}
425483

426484
void TAssimilator::Handle(TEvBlobStorage::TEvPutResult::TPtr ev) {
427485
auto& msg = *ev->Get();
428-
(msg.Status == NKikimrProto::OK ? Self->AssimilatorLatestOkPut : Self->AssimilatorLatestErrorPut) = TInstant::Now();
486+
(msg.Status == NKikimrProto::OK ? Self->Assimilator.LatestOkPut : Self->Assimilator.LatestErrorPut) = TInstant::Now();
429487
if (msg.Status == NKikimrProto::OK) {
430488
Self->TabletCounters->Cumulative()[NKikimrBlobDepot::COUNTER_DECOMMIT_PUT_OK_BYTES] += msg.Id.BlobSize();
431-
++Self->AssimilatorBlobsPutOk;
489+
++Self->Assimilator.BlobsPutOk;
490+
Self->Assimilator.BytesToCopy -= msg.Id.BlobSize();
491+
Self->Assimilator.BytesCopied += msg.Id.BlobSize();
432492
} else {
433-
++Self->AssimilatorBlobsPutError;
493+
++Self->Assimilator.BlobsPutError;
434494
}
435495
const auto it = PutIdToKey.find(ev->Cookie);
436496
Y_ABORT_UNLESS(it != PutIdToKey.end());
@@ -554,14 +614,41 @@ namespace NKikimr::NBlobDepot {
554614
}
555615

556616
void TAssimilator::UpdateAssimilatorPosition() const {
557-
Self->AssimilatorPosition = TStringBuilder()
617+
Self->Assimilator.Position = TStringBuilder()
558618
<< "SkipBlocksUpTo# " << (SkipBlocksUpTo ? ToString(*SkipBlocksUpTo) : "<none>") << Endl
559619
<< "SkipBarriersUpTo# " << (SkipBarriersUpTo
560620
? TString(TStringBuilder() << std::get<0>(*SkipBarriersUpTo) << ':' << (int)std::get<1>(*SkipBarriersUpTo))
561621
: "<none>") << Endl
562622
<< "SkipBlobsUpTo# " << (SkipBlobsUpTo ? SkipBlobsUpTo->ToString() : "<none>");
563623
}
564624

625+
void TAssimilator::UpdateBytesCopiedQ() {
626+
while (BytesCopiedQ.size() >= 3) {
627+
BytesCopiedQ.pop_front();
628+
}
629+
BytesCopiedQ.emplace_back(TActivationContext::Monotonic(), Self->Assimilator.BytesCopied);
630+
631+
Self->Assimilator.CopySpeed = 0;
632+
Self->Assimilator.CopyTimeRemaining = TDuration::Max();
633+
634+
if (BytesCopiedQ.size() > 1) {
635+
const auto& [frontTs, frontBytes] = BytesCopiedQ.front();
636+
const auto& [backTs, backBytes] = BytesCopiedQ.back();
637+
const TDuration deltaTs = backTs - frontTs;
638+
const ui64 deltaBytes = backBytes - frontBytes;
639+
if (deltaTs != TDuration::Zero()) {
640+
Self->Assimilator.CopySpeed = deltaBytes * 1'000'000 / deltaTs.MicroSeconds();
641+
}
642+
if (deltaBytes) {
643+
Self->Assimilator.CopyTimeRemaining = TDuration::MicroSeconds(Self->Assimilator.BytesToCopy *
644+
deltaTs.MicroSeconds() / deltaBytes);
645+
}
646+
}
647+
648+
TActivationContext::Schedule(TDuration::Seconds(1), new IEventHandle(TEvPrivate::EvUpdateBytesCopiedQ, 0,
649+
SelfId(), {}, nullptr, 0));
650+
}
651+
565652
void TBlobDepot::TData::ExecuteTxCommitAssimilatedBlob(NKikimrProto::EReplyStatus status, TBlobSeqId blobSeqId,
566653
TData::TKey key, ui32 notifyEventType, TActorId parentId, ui64 cookie, bool keep, bool doNotKeep) {
567654
Self->Execute(std::make_unique<TTxCommitAssimilatedBlob>(Self, status, blobSeqId, std::move(key),

ydb/core/blob_depot/assimilator.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,10 @@ namespace NKikimr::NBlobDepot {
1010
// Private event types of the assimilator actor. Values are numbered
// consecutively from the start of the ES_PRIVATE event space, so the order of
// the enumerators determines their numeric ids.
struct TEvPrivate {
1111
enum {
1212
// Dispatched to TAssimilator::Action().
EvResume = EventSpaceBegin(TEvents::ES_PRIVATE),
13+
// Continues a time-sliced planning scan (HandleResumeScanDataForPlanning).
EvResumeScanDataForPlanning,
1314
// Continues a time-sliced copying scan (HandleResumeScanDataForCopying).
EvResumeScanDataForCopying,
1415
// Completion notification dispatched to HandleTxComplete().
EvTxComplete,
16+
// Periodic tick dispatched to UpdateBytesCopiedQ(), which reschedules it every second.
EvUpdateBytesCopiedQ,
1517
};
1618
};
1719

@@ -42,6 +44,12 @@ namespace NKikimr::NBlobDepot {
4244
bool ActionInProgress = false;
4345
bool ResumeScanDataForCopyingInFlight = false;
4446

47+
std::optional<TLogoBlobID> LastPlanScannedKey;
48+
bool PlanningComplete = false;
49+
bool ResumeScanDataForPlanningInFlight = false;
50+
51+
std::deque<std::tuple<TMonotonic, ui64>> BytesCopiedQ;
52+
4553
public:
4654
static constexpr NKikimrServices::TActivity::EType ActorActivityType() {
4755
return NKikimrServices::TActivity::BLOB_DEPOT_ASSIMILATOR_ACTOR;
@@ -62,6 +70,8 @@ namespace NKikimr::NBlobDepot {
6270
void Action();
6371
void SendAssimilateRequest();
6472
void Handle(TEvBlobStorage::TEvAssimilateResult::TPtr ev);
73+
void ScanDataForPlanning();
74+
void HandleResumeScanDataForPlanning();
6575
void ScanDataForCopying();
6676
void HandleResumeScanDataForCopying();
6777
void Handle(TEvBlobStorage::TEvGetResult::TPtr ev);
@@ -74,6 +84,7 @@ namespace NKikimr::NBlobDepot {
7484
void Handle(TEvBlobStorage::TEvControllerGroupDecommittedResponse::TPtr ev);
7585
TString SerializeAssimilatorState() const;
7686
void UpdateAssimilatorPosition() const;
87+
void UpdateBytesCopiedQ();
7788
};
7889

7990
} // NKikimr::NBlobDepot

ydb/core/blob_depot/blob_depot_tablet.h

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -290,17 +290,24 @@ namespace NKikimr::NBlobDepot {
290290
TActorId GroupAssimilatorId;
291291
EDecommitState DecommitState = EDecommitState::Default;
292292
std::optional<TString> AssimilatorState;
293-
TString AssimilatorPosition;
294-
TInstant AssimilatorLatestErrorGet;
295-
TInstant AssimilatorLatestOkGet;
296-
TInstant AssimilatorLatestErrorPut;
297-
TInstant AssimilatorLatestOkPut;
298-
TLogoBlobID AssimilatorLastReadBlobId;
299-
ui64 AssimilatorBlobsReadOk = 0;
300-
ui64 AssimilatorBlobsReadNoData = 0;
301-
ui64 AssimilatorBlobsReadError = 0;
302-
ui64 AssimilatorBlobsPutOk = 0;
303-
ui64 AssimilatorBlobsPutError = 0;
293+
struct {
294+
TString Position;
295+
TInstant LatestErrorGet;
296+
TInstant LatestOkGet;
297+
TInstant LatestErrorPut;
298+
TInstant LatestOkPut;
299+
TLogoBlobID LastReadBlobId;
300+
ui64 BytesToCopy = 0;
301+
ui64 BytesCopied = 0;
302+
ui64 CopySpeed = 0;
303+
TDuration CopyTimeRemaining = TDuration::Max();
304+
ui64 BlobsReadOk = 0;
305+
ui64 BlobsReadNoData = 0;
306+
ui64 BlobsReadError = 0;
307+
ui64 BlobsPutOk = 0;
308+
ui64 BlobsPutError = 0;
309+
ui32 CopyIteration = 0;
310+
} Assimilator;
304311

305312
class TGroupAssimilator;
306313

ydb/core/blob_depot/mon_main.cpp

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -456,19 +456,24 @@ namespace NKikimr::NBlobDepot {
456456
KEYVALUE_P("Now", TInstant::Now());
457457
KEYVALUE_P("Decommit state", DecommitState);
458458
KEYVALUE_P("Assimilator state", GroupAssimilatorId ? "running" : "stopped");
459-
KEYVALUE_P("Assimilator position", TStringBuilder() << "<pre>" << AssimilatorPosition << "<pre/>");
459+
KEYVALUE_P("Assimilator position", TStringBuilder() << "<pre>" << Assimilator.Position << "<pre/>");
460460
KEYVALUE_P("Last assimilated blob id", Data->LastAssimilatedBlobId ?
461461
Data->LastAssimilatedBlobId->ToString() : "<null>");
462-
KEYVALUE_P("Last read blob id", AssimilatorLastReadBlobId);
463-
KEYVALUE_P("Latest successful get", AssimilatorLatestOkGet);
464-
KEYVALUE_P("Latest erroneous get", AssimilatorLatestErrorGet);
465-
KEYVALUE_P("Latest successful put", AssimilatorLatestOkPut);
466-
KEYVALUE_P("Latest erroneous put", AssimilatorLatestErrorPut);
467-
KEYVALUE_P("Blobs read with OK", AssimilatorBlobsReadOk);
468-
KEYVALUE_P("Blobs read with NODATA", AssimilatorBlobsReadNoData);
469-
KEYVALUE_P("Blobs read with error", AssimilatorBlobsReadError);
470-
KEYVALUE_P("Blobs put with OK", AssimilatorBlobsPutOk);
471-
KEYVALUE_P("Blobs put with error", AssimilatorBlobsPutError);
462+
KEYVALUE_P("Copy iteration", Assimilator.CopyIteration);
463+
KEYVALUE_P("Bytes to copy", Assimilator.BytesToCopy);
464+
KEYVALUE_P("Bytes already copied", Assimilator.BytesCopied);
465+
KEYVALUE_P("Copy speed, bytes per second", Assimilator.CopySpeed);
466+
KEYVALUE_P("Copy time remaining", Assimilator.CopyTimeRemaining);
467+
KEYVALUE_P("Last read blob id", Assimilator.LastReadBlobId);
468+
KEYVALUE_P("Latest successful get", Assimilator.LatestOkGet);
469+
KEYVALUE_P("Latest erroneous get", Assimilator.LatestErrorGet);
470+
KEYVALUE_P("Latest successful put", Assimilator.LatestOkPut);
471+
KEYVALUE_P("Latest erroneous put", Assimilator.LatestErrorPut);
472+
KEYVALUE_P("Blobs read with OK", Assimilator.BlobsReadOk);
473+
KEYVALUE_P("Blobs read with NODATA", Assimilator.BlobsReadNoData);
474+
KEYVALUE_P("Blobs read with error", Assimilator.BlobsReadError);
475+
KEYVALUE_P("Blobs put with OK", Assimilator.BlobsPutOk);
476+
KEYVALUE_P("Blobs put with error", Assimilator.BlobsPutError);
472477
})
473478
}
474479
}

0 commit comments

Comments
 (0)