Skip to content

Commit fca1f18

Browse files
Wide combiner spill during state split (#9785)
1 parent e96a66e commit fca1f18

File tree

1 file changed

+118
-17
lines changed

1 file changed

+118
-17
lines changed

ydb/library/yql/minikql/comp_nodes/mkql_wide_combine.cpp

Lines changed: 118 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,7 @@ class TSpillingSupportState : public TComputationValue<TSpillingSupportState> {
360360

361361
enum class EOperatingMode {
362362
InMemory,
363+
SplittingState,
363364
Spilling,
364365
ProcessSpilled
365366
};
@@ -378,6 +379,7 @@ class TSpillingSupportState : public TComputationValue<TSpillingSupportState> {
378379
Extract,
379380
Finish
380381
};
382+
381383
TSpillingSupportState(
382384
TMemoryUsageInfo* memInfo,
383385
const TMultiType* usedInputItemType, const TMultiType* keyAndStateType, ui32 keyWidth, size_t itemNodesSize,
@@ -403,7 +405,9 @@ class TSpillingSupportState : public TComputationValue<TSpillingSupportState> {
403405
}
404406

405407
EUpdateResult Update() {
406-
if (IsEverythingExtracted) return EUpdateResult::Finish;
408+
if (IsEverythingExtracted) {
409+
return EUpdateResult::Finish;
410+
}
407411

408412
switch (GetMode()) {
409413
case EOperatingMode::InMemory: {
@@ -415,11 +419,16 @@ class TSpillingSupportState : public TComputationValue<TSpillingSupportState> {
415419

416420
return EUpdateResult::ReadInput;
417421
}
422+
case EOperatingMode::SplittingState: {
423+
if (SplitStateIntoBucketsAndWait()) return EUpdateResult::Yield;
424+
return Update();
425+
}
418426
case EOperatingMode::Spilling: {
419427
UpdateSpillingBuckets();
420428

429+
421430
if (!HasMemoryForProcessing() && InputStatus != EFetchResult::Finish && TryToReduceMemoryAndWait()) {
422-
return EUpdateResult::Yield;
431+
return EUpdateResult::Yield;
423432
}
424433

425434
if (BufferForUsedInputItems.size()) {
@@ -522,13 +531,65 @@ class TSpillingSupportState : public TComputationValue<TSpillingSupportState> {
522531
return ProcessSpilledData();
523532
}
524533

525-
void SplitStateIntoBuckets() {
526-
while (const auto keyAndState = static_cast<NUdf::TUnboxedValue *>(InMemoryProcessingState.Extract())) {
534+
ui32 GetLargestInMemoryBucketNumber() const {
535+
ui64 maxSize = 0;
536+
ui32 largestInMemoryBucketNum = (ui32)-1;
537+
for (ui64 i = 0; i < SpilledBucketCount; ++i) {
538+
if (SpilledBuckets[i].BucketState == TSpilledBucket::EBucketState::InMemory) {
539+
if (SpilledBuckets[i].LineCount >= maxSize) {
540+
largestInMemoryBucketNum = i;
541+
maxSize = SpilledBuckets[i].LineCount;
542+
}
543+
}
544+
}
545+
return largestInMemoryBucketNum;
546+
}
547+
548+
bool IsSpillingWhileStateSplitAllowed() const {
549+
// TODO: Write a better condition here, for example: the number of in-memory buckets > 64.
550+
return true;
551+
}
552+
553+
bool SplitStateIntoBucketsAndWait() {
554+
if (SplitStateSpillingBucket != -1) {
555+
auto& bucket = SpilledBuckets[SplitStateSpillingBucket];
556+
MKQL_ENSURE(bucket.AsyncWriteOperation.has_value(), "Internal logic error");
557+
if (!bucket.AsyncWriteOperation->HasValue()) return true;
558+
bucket.SpilledState->AsyncWriteCompleted(bucket.AsyncWriteOperation->ExtractValue());
559+
bucket.AsyncWriteOperation = std::nullopt;
560+
561+
while (const auto keyAndState = static_cast<NUdf::TUnboxedValue*>(bucket.InMemoryProcessingState->Extract())) {
562+
bucket.AsyncWriteOperation = bucket.SpilledState->WriteWideItem({keyAndState, KeyAndStateType->GetElementsCount()});
563+
for (size_t i = 0; i < KeyAndStateType->GetElementsCount(); ++i) {
564+
//releasing values stored in unsafe TUnboxedValue buffer
565+
keyAndState[i].UnRef();
566+
}
567+
if (bucket.AsyncWriteOperation) return true;
568+
}
569+
570+
SplitStateSpillingBucket = -1;
571+
}
572+
while (const auto keyAndState = static_cast<NUdf::TUnboxedValue *>(InMemoryProcessingState.Extract())) {
527573
auto hash = Hasher(keyAndState); //Hasher uses only key for hashing
528574
auto bucketId = hash % SpilledBucketCount;
529575
auto& bucket = SpilledBuckets[bucketId];
530576

531577
bucket.LineCount++;
578+
579+
if (bucket.BucketState != TSpilledBucket::EBucketState::InMemory) {
580+
bucket.BucketState = TSpilledBucket::EBucketState::SpillingState;
581+
bucket.AsyncWriteOperation = bucket.SpilledState->WriteWideItem({keyAndState, KeyAndStateType->GetElementsCount()});
582+
for (size_t i = 0; i < KeyAndStateType->GetElementsCount(); ++i) {
583+
//releasing values stored in unsafe TUnboxedValue buffer
584+
keyAndState[i].UnRef();
585+
}
586+
if (bucket.AsyncWriteOperation) {
587+
SplitStateSpillingBucket = bucketId;
588+
return true;
589+
}
590+
continue;
591+
}
592+
532593
auto& processingState = *bucket.InMemoryProcessingState;
533594

534595
for (size_t i = 0; i < KeyWidth; ++i) {
@@ -540,16 +601,58 @@ class TSpillingSupportState : public TComputationValue<TSpillingSupportState> {
540601
//jumping into unsafe world, refusing ownership
541602
static_cast<NUdf::TUnboxedValue&>(processingState.Throat[i - KeyWidth]) = std::move(keyAndState[i]);
542603
}
604+
605+
if (InMemoryBucketsCount && !HasMemoryForProcessing() && IsSpillingWhileStateSplitAllowed()) {
606+
ui32 bucketNumToSpill = GetLargestInMemoryBucketNumber();
607+
608+
SplitStateSpillingBucket = bucketNumToSpill;
609+
InMemoryBucketsCount--;
610+
611+
auto& bucket = SpilledBuckets[bucketNumToSpill];
612+
bucket.BucketState = TSpilledBucket::EBucketState::SpillingState;
613+
614+
while (const auto keyAndState = static_cast<NUdf::TUnboxedValue*>(bucket.InMemoryProcessingState->Extract())) {
615+
bucket.AsyncWriteOperation = bucket.SpilledState->WriteWideItem({keyAndState, KeyAndStateType->GetElementsCount()});
616+
for (size_t i = 0; i < KeyAndStateType->GetElementsCount(); ++i) {
617+
//releasing values stored in unsafe TUnboxedValue buffer
618+
keyAndState[i].UnRef();
619+
}
620+
if (bucket.AsyncWriteOperation) return true;
621+
}
622+
623+
bucket.AsyncWriteOperation = bucket.SpilledState->FinishWriting();
624+
if (bucket.AsyncWriteOperation) return true;
625+
}
626+
}
627+
628+
for (ui64 i = 0; i < SpilledBucketCount; ++i) {
629+
auto& bucket = SpilledBuckets[i];
630+
if (bucket.BucketState == TSpilledBucket::EBucketState::SpillingState) {
631+
if (bucket.AsyncWriteOperation.has_value()) {
632+
if (!bucket.AsyncWriteOperation->HasValue()) return true;
633+
bucket.SpilledState->AsyncWriteCompleted(bucket.AsyncWriteOperation->ExtractValue());
634+
bucket.AsyncWriteOperation = std::nullopt;
635+
}
636+
637+
bucket.AsyncWriteOperation = bucket.SpilledState->FinishWriting();
638+
if (bucket.AsyncWriteOperation) return true;
639+
bucket.InMemoryProcessingState->ReadMore<false>();
640+
641+
bucket.BucketState = TSpilledBucket::EBucketState::SpillingData;
642+
}
543643
}
544644

545645
InMemoryProcessingState.ReadMore<false>();
646+
IsInMemoryProcessingStateSplitted = true;
647+
SwitchMode(EOperatingMode::Spilling);
648+
return false;
546649
}
547650

548651
bool CheckMemoryAndSwitchToSpilling() {
549652
if (AllowSpilling && Ctx.SpillerFactory && IsSwitchToSpillingModeCondition()) {
550653
LogMemoryUsage();
551654

552-
SwitchMode(EOperatingMode::Spilling);
655+
SwitchMode(EOperatingMode::SplittingState);
553656
return true;
554657
}
555658

@@ -619,15 +722,7 @@ class TSpillingSupportState : public TComputationValue<TSpillingSupportState> {
619722
return true;
620723
}
621724
while (InMemoryBucketsCount > 0) {
622-
ui64 maxLineCount = 0;
623-
ui32 maxLineBucketInd = (ui32)-1;
624-
for (ui64 i = 0; i < SpilledBucketCount; ++i) {
625-
const auto& bucket = SpilledBuckets[i];
626-
if (bucket.BucketState == TSpilledBucket::EBucketState::InMemory && (maxLineBucketInd == (ui32)-1 || bucket.LineCount > maxLineCount)) {
627-
maxLineCount = bucket.LineCount;
628-
maxLineBucketInd = i;
629-
}
630-
}
725+
ui32 maxLineBucketInd = GetLargestInMemoryBucketNumber();
631726
MKQL_ENSURE(maxLineBucketInd != (ui32)-1, "Internal logic error");
632727

633728
auto& bucketToSpill = SpilledBuckets[maxLineBucketInd];
@@ -701,8 +796,8 @@ class TSpillingSupportState : public TComputationValue<TSpillingSupportState> {
701796
MKQL_ENSURE(false, "Internal logic error");
702797
break;
703798
}
704-
case EOperatingMode::Spilling: {
705-
YQL_LOG(INFO) << "switching Memory mode to Spilling";
799+
case EOperatingMode::SplittingState: {
800+
YQL_LOG(INFO) << "switching Memory mode to SplittingState";
706801
MKQL_ENSURE(EOperatingMode::InMemory == Mode, "Internal logic error");
707802
SpilledBuckets.resize(SpilledBucketCount);
708803
auto spiller = Ctx.SpillerFactory->CreateSpiller();
@@ -711,7 +806,11 @@ class TSpillingSupportState : public TComputationValue<TSpillingSupportState> {
711806
b.SpilledData = std::make_unique<TWideUnboxedValuesSpillerAdapter>(spiller, UsedInputItemType, 5_MB);
712807
b.InMemoryProcessingState = std::make_unique<TState>(MemInfo, KeyWidth, KeyAndStateType->GetElementsCount() - KeyWidth, Hasher, Equal);
713808
}
714-
SplitStateIntoBuckets();
809+
break;
810+
}
811+
case EOperatingMode::Spilling: {
812+
YQL_LOG(INFO) << "switching Memory mode to Spilling";
813+
MKQL_ENSURE(EOperatingMode::SplittingState == Mode || EOperatingMode::InMemory == Mode, "Internal logic error");
715814

716815
Tongue = ViewForKeyAndState.data();
717816
break;
@@ -744,6 +843,7 @@ class TSpillingSupportState : public TComputationValue<TSpillingSupportState> {
744843
bool IsEverythingExtracted = false;
745844

746845
TState InMemoryProcessingState;
846+
bool IsInMemoryProcessingStateSplitted = false;
747847
const TMultiType* const UsedInputItemType;
748848
const TMultiType* const KeyAndStateType;
749849
const size_t KeyWidth;
@@ -760,6 +860,7 @@ class TSpillingSupportState : public TComputationValue<TSpillingSupportState> {
760860
ui64 BufferForUsedInputItemsBucketId;
761861
TUnboxedValueVector BufferForUsedInputItems;
762862
std::vector<NUdf::TUnboxedValuePod, TMKQLAllocator<NUdf::TUnboxedValuePod>> ViewForKeyAndState;
863+
i64 SplitStateSpillingBucket = -1;
763864

764865
TMemoryUsageInfo* MemInfo = nullptr;
765866
TEqualsFunc const Equal;

0 commit comments

Comments
 (0)