Skip to content

Add RowsCount/Rows and use it for stats purposes #15629

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,12 @@ void TDqComputeActorChannels::HandleWork(TEvDqCompute::TEvChannelData::TPtr& ev)

TInputChannelState& inputChannel = InCh(channelId);

if (Y_UNLIKELY(channelData.Proto.GetData().GetRows() == 0 && channelData.Proto.GetData().GetChunks() > 0)) {
// For backward compatibility, to support communication with old nodes during rollback/migration
// Should be deleted eventually ~ mid 2025
channelData.Proto.MutableData()->SetRows(channelData.Proto.GetData().GetChunks());
}

LOG_T("Received input for channelId: " << channelId
<< ", seqNo: " << record.GetSeqNo()
<< ", size: " << channelData.Proto.GetData().GetRaw().size()
Expand Down
3 changes: 3 additions & 0 deletions ydb/library/yql/dq/common/dq_serialized_batch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,13 @@ TChunkedBuffer SaveForSpilling(TDqSerializedBatch&& batch) {

ui32 transportVersion = batch.Proto.GetTransportVersion();
ui32 chunkCount = batch.Proto.GetChunks();
ui32 rowCount = batch.Proto.GetRows();

TChunkedBuffer protoPayload(std::move(*batch.Proto.MutableRaw()));

AppendNumber(result, transportVersion);
AppendNumber(result, chunkCount);
AppendNumber(result, rowCount);
AppendNumber(result, protoPayload.Size());
result.Append(std::move(protoPayload));
AppendNumber(result, batch.Payload.Size());
Expand All @@ -85,6 +87,7 @@ TDqSerializedBatch LoadSpilled(TBuffer&& blob) {
TDqSerializedBatch result;
result.Proto.SetTransportVersion(ReadNumber<ui32>(source));
result.Proto.SetChunks(ReadNumber<ui32>(source));
result.Proto.SetRows(ReadNumber<ui32>(source));

size_t protoSize = ReadNumber<size_t>(source);
YQL_ENSURE(source.size() >= protoSize, "Premature end of spilled data");
Expand Down
2 changes: 1 addition & 1 deletion ydb/library/yql/dq/common/dq_serialized_batch.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ struct TDqSerializedBatch {
}

// Returns the row count of this serialized batch, read from the Proto header.
// The second return prefers Proto.GetRows() and falls back to Proto.GetChunks()
// when Rows is zero (i.e. not set by the producer).
// NOTE(review): this span comes from a rendered diff without +/- markers; the first
// return (carrying the FIXME) looks like the pre-change line and the second like its
// replacement. As literally written, the second return is unreachable - confirm
// against the merged file that only the Rows-aware return remains.
ui32 RowCount() const {
return Proto.GetChunks(); // FIXME with Rows
return Proto.GetRows() ? Proto.GetRows() : Proto.GetChunks();
}

void Clear() {
Expand Down
1 change: 1 addition & 0 deletions ydb/library/yql/dq/proto/dq_transport.proto
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,5 @@ message TData {
bytes Raw = 2;
uint32 Chunks = 3;
optional uint32 PayloadId = 4;
uint32 Rows = 5;
}
52 changes: 45 additions & 7 deletions ydb/library/yql/dq/runtime/dq_output_channel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,12 @@ class TDqOutputChannel : public IDqOutputChannel {
return;
}

ui32 rows = Packer.IsBlock() ?
NKikimr::NMiniKQL::TArrowBlock::From(values[width - 1]).GetDatum().scalar_as<arrow::UInt64Scalar>().value
: 1;

if (PushStats.CollectBasic()) {
PushStats.Rows++;
PushStats.Rows += rows;
PushStats.Chunks++;
PushStats.Resume();
}
Expand All @@ -111,6 +115,7 @@ class TDqOutputChannel : public IDqOutputChannel {
}

PackerCurrentChunkCount++;
PackerCurrentRowCount += rows;

size_t packerSize = Packer.PackedSizeEstimate();
if (packerSize >= MaxChunkBytes) {
Expand All @@ -121,8 +126,11 @@ class TDqOutputChannel : public IDqOutputChannel {
}
PackedDataSize += Data.back().Buffer.Size();
PackedChunkCount += PackerCurrentChunkCount;
PackedRowCount += PackerCurrentRowCount;
Data.back().ChunkCount = PackerCurrentChunkCount;
Data.back().RowCount = PackerCurrentRowCount;
PackerCurrentChunkCount = 0;
PackerCurrentRowCount = 0;
packerSize = 0;
}

Expand All @@ -134,11 +142,13 @@ class TDqOutputChannel : public IDqOutputChannel {
TDqSerializedBatch data;
data.Proto.SetTransportVersion(TransportVersion);
data.Proto.SetChunks(head.ChunkCount);
data.Proto.SetRows(head.RowCount);
data.SetPayload(std::move(head.Buffer));
Storage->Put(NextStoredId++, SaveForSpilling(std::move(data)));

PackedDataSize -= bufSize;
PackedChunkCount -= head.ChunkCount;
PackedRowCount -= head.RowCount;

SpilledChunkCount += head.ChunkCount;

Expand Down Expand Up @@ -199,22 +209,26 @@ class TDqOutputChannel : public IDqOutputChannel {
} else if (!Data.empty()) {
auto& packed = Data.front();
PackedChunkCount -= packed.ChunkCount;
PackedRowCount -= packed.RowCount;
PackedDataSize -= packed.Buffer.Size();
data.Proto.SetChunks(packed.ChunkCount);
data.Proto.SetRows(packed.RowCount);
data.SetPayload(std::move(packed.Buffer));
Data.pop_front();
} else {
data.Proto.SetChunks(PackerCurrentChunkCount);
data.Proto.SetRows(PackerCurrentRowCount);
data.SetPayload(FinishPackAndCheckSize());
PackerCurrentChunkCount = 0;
PackerCurrentRowCount = 0;
}

DLOG("Took " << data.RowCount() << " rows");

if (PopStats.CollectBasic()) {
PopStats.Bytes += data.Size();
PopStats.Rows += data.RowCount();
PopStats.Chunks++;
PopStats.Chunks++; // pop chunks do not match push chunks
if (!IsFull() || FirstStoredId == NextStoredId) {
PopStats.Resume();
}
Expand Down Expand Up @@ -257,28 +271,43 @@ class TDqOutputChannel : public IDqOutputChannel {
data.Proto.SetTransportVersion(TransportVersion);
if (SpilledChunkCount == 0 && PackedChunkCount == 0) {
data.Proto.SetChunks(PackerCurrentChunkCount);
data.Proto.SetRows(PackerCurrentRowCount);
data.SetPayload(FinishPackAndCheckSize());
if (PushStats.CollectBasic()) {
PushStats.Bytes += data.Payload.Size();
}
PackerCurrentChunkCount = 0;
PackerCurrentRowCount = 0;
return true;
}

// Repack all - that's why PopAll should never be used
if (PackerCurrentChunkCount) {
Data.emplace_back();
Data.back().Buffer = FinishPackAndCheckSize();
if (PushStats.CollectBasic()) {
PushStats.Bytes += Data.back().Buffer.Size();
}
PackedDataSize += Data.back().Buffer.Size();
PackedChunkCount += PackerCurrentChunkCount;
PackedRowCount += PackerCurrentRowCount;
Data.back().ChunkCount = PackerCurrentChunkCount;
Data.back().RowCount = PackerCurrentRowCount;
PackerCurrentChunkCount = 0;
PackerCurrentRowCount = 0;
}

NKikimr::NMiniKQL::TUnboxedValueBatch rows(OutputType);
size_t repackedChunkCount = 0;
size_t repackedRowCount = 0;
for (;;) {
TDqSerializedBatch chunk;
if (!this->Pop(chunk)) {
TDqSerializedBatch batch;
if (!this->Pop(batch)) {
break;
}
Packer.UnpackBatch(chunk.PullPayload(), HolderFactory, rows);
repackedChunkCount += batch.ChunkCount();
repackedRowCount += batch.RowCount();
Packer.UnpackBatch(batch.PullPayload(), HolderFactory, rows);
}

if (OutputType->IsMulti()) {
Expand All @@ -291,7 +320,8 @@ class TDqOutputChannel : public IDqOutputChannel {
});
}

data.Proto.SetChunks(rows.RowCount()); // 1 UVB "row" is Chunk
data.Proto.SetChunks(repackedChunkCount);
data.Proto.SetRows(repackedRowCount);
data.SetPayload(FinishPackAndCheckSize());
if (PopStats.CollectBasic()) {
PopStats.Bytes += data.Size();
Expand Down Expand Up @@ -332,7 +362,12 @@ class TDqOutputChannel : public IDqOutputChannel {
ui64 rows = GetValuesCount();
Data.clear();
Packer.Clear();
SpilledChunkCount = PackedDataSize = PackedChunkCount = PackerCurrentChunkCount = 0;
PackedDataSize = 0;
PackedChunkCount = 0;
PackedRowCount = 0;
SpilledChunkCount = 0;
PackerCurrentChunkCount = 0;
PackerCurrentRowCount = 0;
FirstStoredId = NextStoredId;
return rows;
}
Expand All @@ -359,6 +394,7 @@ class TDqOutputChannel : public IDqOutputChannel {
// Element type of the Data deque: one already-packed batch together with the
// counters that were accumulated while it was being packed.
struct TSerializedBatch {
// Packed payload produced by FinishPackAndCheckSize().
TChunkedBuffer Buffer;
// Chunk count for Buffer; assigned from PackerCurrentChunkCount when the batch is sealed.
ui64 ChunkCount = 0;
// Row count for Buffer; assigned from PackerCurrentRowCount when the batch is sealed.
ui64 RowCount = 0;
};
std::deque<TSerializedBatch> Data;

Expand All @@ -368,8 +404,10 @@ class TDqOutputChannel : public IDqOutputChannel {

size_t PackedDataSize = 0;
size_t PackedChunkCount = 0;
size_t PackedRowCount = 0;

size_t PackerCurrentChunkCount = 0;
size_t PackerCurrentRowCount = 0;

bool Finished = false;

Expand Down
3 changes: 3 additions & 0 deletions ydb/library/yql/dq/runtime/dq_transport.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ TDqSerializedBatch SerializeValue(NDqProto::EDataTransportVersion version, const
TDqSerializedBatch result;
result.Proto.SetTransportVersion(version);
result.Proto.SetChunks(1);
result.Proto.SetRows(1);
result.SetPayload(std::move(packResult));
return result;
}
Expand Down Expand Up @@ -88,6 +89,7 @@ TDqSerializedBatch SerializeBuffer(NDqProto::EDataTransportVersion version, cons
TDqSerializedBatch result;
result.Proto.SetTransportVersion(version);
result.Proto.SetChunks(buffer.RowCount());
result.Proto.SetRows(buffer.RowCount()); // maybe incorrect for Arrow Blocks
result.SetPayload(std::move(packResult));
return result;
}
Expand Down Expand Up @@ -177,6 +179,7 @@ NDqProto::TData TDqDataSerializer::SerializeParamValue(const TType* type, const
data.SetTransportVersion(NDqProto::DATA_TRANSPORT_UV_PICKLE_1_0);
data.SetRaw(packResult.data(), packResult.size());
data.SetChunks(1);
data.SetRows(1);

return data;
}
Expand Down
Loading