Skip to content

Commit 59da1af

Browse files
[memprof] Speed up caller-callee pair extraction (#116184)
We know that the MemProf profile has a lot of duplicate call stacks. Extracting caller-callee pairs from a call stack we've seen before is wasted effort. This patch makes the extraction more efficient by first coming up with a work list of linear call stack IDs -- the set of starting positions in the radix tree array -- and then extracting caller-callee pairs from each call stack in the work list. We implement the work list as a bit vector because we expect the work list to be dense in the range [0, RadixTreeSize). Also, we want the set insertion to be cheap. Without this patch, it takes 25 seconds to extract caller-callee pairs from a large MemProf profile. This patch shortens that down to 4 seconds.
1 parent d761b74 commit 59da1af

File tree

3 files changed

+36
-4
lines changed

3 files changed

+36
-4
lines changed

llvm/include/llvm/ProfileData/InstrProfReader.h

+2
Original file line numberDiff line numberDiff line change
@@ -683,6 +683,8 @@ class IndexedMemProfReader {
683683
const unsigned char *FrameBase = nullptr;
684684
/// The starting address of the call stack array.
685685
const unsigned char *CallStackBase = nullptr;
686+
// The number of elements in the radix tree array.
687+
unsigned RadixTreeSize = 0;
686688

687689
Error deserializeV012(const unsigned char *Start, const unsigned char *Ptr,
688690
uint64_t FirstWord);

llvm/lib/ProfileData/InstrProfReader.cpp

+18-1
Original file line numberDiff line numberDiff line change
@@ -1303,6 +1303,12 @@ Error IndexedMemProfReader::deserializeV3(const unsigned char *Start,
13031303
FrameBase = Ptr;
13041304
CallStackBase = Start + CallStackPayloadOffset;
13051305

1306+
// Compute the number of elements in the radix tree array. Since we use this
1307+
// to reserve enough bits in a BitVector, it's totally OK if we overestimate
1308+
// this number a little bit because of padding just before the next section.
1309+
RadixTreeSize = (RecordPayloadOffset - CallStackPayloadOffset) /
1310+
sizeof(memprof::LinearFrameId);
1311+
13061312
// Now initialize the table reader with a pointer into data buffer.
13071313
MemProfRecordTable.reset(MemProfRecordHashTable::Create(
13081314
/*Buckets=*/Start + RecordTableOffset,
@@ -1674,11 +1680,22 @@ IndexedMemProfReader::getMemProfCallerCalleePairs() const {
16741680
memprof::LinearFrameIdConverter FrameIdConv(FrameBase);
16751681
memprof::CallerCalleePairExtractor Extractor(CallStackBase, FrameIdConv);
16761682

1683+
// The set of linear call stack IDs that we need to traverse from. We expect
1684+
// the set to be dense, so we use a BitVector.
1685+
BitVector Worklist(RadixTreeSize);
1686+
1687+
// Collect the set of linear call stack IDs. Since we expect a lot of
1688+
// duplicates, we first collect them in the form of a bit vector before
1689+
// processing them.
16771690
for (const memprof::IndexedMemProfRecord &IndexedRecord :
16781691
MemProfRecordTable->data())
16791692
for (const memprof::IndexedAllocationInfo &IndexedAI :
16801693
IndexedRecord.AllocSites)
1681-
Extractor(IndexedAI.CSId);
1694+
Worklist.set(IndexedAI.CSId);
1695+
1696+
// Collect caller-callee pairs for each linear call stack ID in Worklist.
1697+
for (unsigned CS : Worklist.set_bits())
1698+
Extractor(CS);
16821699

16831700
DenseMap<uint64_t, SmallVector<memprof::CallEdgeTy, 0>> Pairs =
16841701
std::move(Extractor.CallerCalleePairs);

llvm/lib/ProfileData/InstrProfWriter.cpp

+16-3
Original file line numberDiff line numberDiff line change
@@ -601,7 +601,8 @@ writeMemProfCallStackArray(
601601
&MemProfCallStackData,
602602
llvm::DenseMap<memprof::FrameId, memprof::LinearFrameId>
603603
&MemProfFrameIndexes,
604-
llvm::DenseMap<memprof::FrameId, memprof::FrameStat> &FrameHistogram) {
604+
llvm::DenseMap<memprof::FrameId, memprof::FrameStat> &FrameHistogram,
605+
unsigned &NumElements) {
605606
llvm::DenseMap<memprof::CallStackId, memprof::LinearCallStackId>
606607
MemProfCallStackIndexes;
607608

@@ -610,6 +611,7 @@ writeMemProfCallStackArray(
610611
FrameHistogram);
611612
for (auto I : Builder.getRadixArray())
612613
OS.write32(I);
614+
NumElements = Builder.getRadixArray().size();
613615
MemProfCallStackIndexes = Builder.takeCallStackPos();
614616

615617
// Release the memory of this vector as it is no longer needed.
@@ -771,15 +773,26 @@ static Error writeMemProfV3(ProfOStream &OS,
771773
writeMemProfFrameArray(OS, MemProfData.Frames, FrameHistogram);
772774

773775
uint64_t CallStackPayloadOffset = OS.tell();
776+
// The number of elements in the call stack array.
777+
unsigned NumElements = 0;
774778
llvm::DenseMap<memprof::CallStackId, memprof::LinearCallStackId>
775-
MemProfCallStackIndexes = writeMemProfCallStackArray(
776-
OS, MemProfData.CallStacks, MemProfFrameIndexes, FrameHistogram);
779+
MemProfCallStackIndexes =
780+
writeMemProfCallStackArray(OS, MemProfData.CallStacks,
781+
MemProfFrameIndexes, FrameHistogram,
782+
NumElements);
777783

778784
uint64_t RecordPayloadOffset = OS.tell();
779785
uint64_t RecordTableOffset =
780786
writeMemProfRecords(OS, MemProfData.Records, &Schema, memprof::Version3,
781787
&MemProfCallStackIndexes);
782788

789+
// IndexedMemProfReader::deserializeV3 computes the number of elements in the
790+
// call stack array from the difference between CallStackPayloadOffset and
791+
// RecordPayloadOffset. Verify that the computation works.
792+
assert(CallStackPayloadOffset +
793+
NumElements * sizeof(memprof::LinearFrameId) ==
794+
RecordPayloadOffset);
795+
783796
uint64_t Header[] = {
784797
CallStackPayloadOffset,
785798
RecordPayloadOffset,

0 commit comments

Comments
 (0)