feat: repair Go stack traces #3014
@@ -0,0 +1,266 @@
package pprof

import (
    "bytes"
    "reflect"
    "sort"
    "unsafe"

    "golang.org/x/exp/slices"

    profilev1 "github.com/grafana/pyroscope/api/gen/proto/go/google/v1"
)

const (
    minGroupSize = 2

    tokens    = 8
    tokenLen  = 16
    suffixLen = tokens + tokenLen

    tokenBytesLen  = tokenLen * 8
    suffixBytesLen = suffixLen * 8
)

// MayHaveGoHeapTruncatedStacktraces reports whether the profile
// may contain truncated stack traces.
func MayHaveGoHeapTruncatedStacktraces(p *profilev1.Profile) bool {
    if !hasGoHeapSampleTypes(p) {
        return false
    }
    // Some truncated stacks have depth less than the depth limit (32).
    const minDepth = 28
    for _, s := range p.Sample {
        if len(s.LocationId) >= minDepth {
            return true
        }
    }

Comment on lines +33 to +37:

See https://go-review.googlesource.com/c/go/+/572396: we also need to check that the depth does not exceed the limit.

I do think the fact that truncation has happened could be propagated more clearly: golang/go#43669

    return false
}
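
To make the reviewer's suggestion above concrete, here is a minimal sketch of how an upper bound could be added to the depth check. It assumes the current 32-frame limit mentioned in the code comment; the helper name and the exact bound are illustrative, not part of this PR:

    // mayBeTruncatedDepth reports whether a single sample's depth falls into
    // the range where truncation at the 32-frame limit is plausible. Samples
    // deeper than the limit were produced by a runtime with a larger limit
    // (see the CL referenced above) and should not be treated as truncated
    // at 32 frames.
    func mayBeTruncatedDepth(s *profilev1.Sample) bool {
        const (
            minDepth = 28 // same lower bound as in MayHaveGoHeapTruncatedStacktraces
            maxDepth = 32 // assumed current heap-profile depth limit
        )
        return len(s.LocationId) >= minDepth && len(s.LocationId) <= maxDepth
    }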

func hasGoHeapSampleTypes(p *profilev1.Profile) bool {
    for _, st := range p.SampleType {
        switch p.StringTable[st.Type] {
        case
            "alloc_objects",
            "alloc_space",
            "inuse_objects",
            "inuse_space":
            return true
        }
    }
    return false
}

// RepairGoHeapTruncatedStacktraces repairs truncated stack traces
// in Go heap profiles.
//
// Go heap profiles have a depth limit of 32 frames, which often
// renders profiles unreadable and also increases the cardinality
// of stack traces.
//
// The function guesses truncated frames based on neighbors and
// repairs stack traces if there is a high chance that the missing
// part is present in the profile. The heuristic is as follows:
//
// For each stack trace S taller than 24 frames: if there is another
// stack trace R taller than 24 frames that overlaps with the given
// one by at least 16 frames in a row from the top, and has frames
// above its root, stack S is considered truncated, and the missing
// part is copied from R.

Review thread on the 16-frame overlap:

why 16 and not less ?

There should be a reasonable overlap between stack traces. Half of the max looks good to me.

I think we should test in dev with different values of that configuration just to see the impact.

A smaller overlap implies more tokens. The more tokens, the more checks we need to perform, and the procedure becomes more expensive. In the meantime, with a shorter overlap, the chance that we will "repair" a wrong stack is higher. Therefore I'm not sure if it's worth it. I experimented with various values and can say that the current ones work well. Thus, I would prefer to deploy this version and optimize it if we deem it necessary.
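
To quantify the trade-off discussed above (the numbers follow from the constants in this file; the alternative value is hypothetical): with suffixLen = 24 and tokenLen = 16, each group yields suffixLen - tokenLen = 8 tokens, each shifted by one location from the next. Halving the overlap to tokenLen = 8 while keeping suffixLen = 24 would yield 16 tokens per group, roughly doubling the map insertions and lookups, while each shorter token would be more likely to match unrelated stacks.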

func RepairGoHeapTruncatedStacktraces(p *profilev1.Profile) {
    // Group stack traces by bottom (closest to the root) locations.
    // Typically, there are very few groups (a hundred or two).
    samples, groups := split(p)
    // Each group's suffix is then tokenized: each part is shifted by one
    // location from the previous one (like n-grams).
    // Tokens are written into the token=>group map, where the value is the
    // index of the group with the token found at the furthest position from
    // the root (across all groups).
    m := make(map[string]group, len(groups)/2)
    for i := 0; i < len(groups); i++ {
        g := groups[i]
        n := len(groups)
        if i+1 < len(groups) {
            n = groups[i+1]
        }
        if s := n - g; s < minGroupSize {
            continue
        }
        // We take the suffix of the first sample in the group.
        s := suffix(samples[g].LocationId)
        // Tokenize the suffix: token position is relative to the stack
        // trace root: 0 means that the token is the closest to the root.
        // TODO: unroll?
        // 0 : 64 : 192 // No need.
        // 1 : 56 : 184
        // 2 : 48 : 176
        // 3 : 40 : 168
        // 4 : 32 : 160
        // 5 : 24 : 152
        // 6 : 16 : 144
        // 7 :  8 : 136
        // 8 :  0 : 128
        //
        // We skip the top/right-most token, as it is not needed:
        // a match at that position cannot point to a more complete
        // stack trace.
        for j := uint32(1); j <= tokens; j++ {
            hi := suffixBytesLen - j*tokens
            lo := hi - tokenBytesLen
            // By taking a string representation of the slice,
            // we eliminate the need to hash the token explicitly:
            // the Go map will do it one way or another.
            k := unsafeString(s[lo:hi])
            // Current candidate: the group where the token is
            // located at the furthest position from the root.
            c, ok := m[k]
            if !ok || j > c.off {
                // This group has more complete stack traces:
                m[k] = group{
                    gid: uint32(i),
                    off: j,
                }
            }
        }
    }

    // Now we handle chaining. Consider the following stacks:
    // 1    2    3    4
    // a    b   [d]  (f)
    // b    c   [e]  (g)
    // c   [d]  (f)   h
    // d   [e]  (g)   i
    //
    // We can't associate the 3rd stack with the 1st one because their tokens
    // do not overlap (given the token size is 2). However, we can associate
    // it transitively through the 2nd stack.
    //
    // Dependencies:
    //  - group i depends on d[i].
    //  - d[i] depends on d[d[i].gid].
    d := make([]group, len(groups))
    for i := 0; i < len(groups); i++ {
        g := groups[i]
        t := topToken(samples[g].LocationId)
        k := unsafeString(t)
        c, ok := m[k]
        if !ok || c.off == 0 || groups[c.gid] == g {
            // The current group has the most complete stack trace.
            continue
        }
        d[i] = c
    }

    // Then, for each group, we test whether there is another group with a more
    // complete suffix, overlapping with the given one by at least one token.
    // If such a stack trace exists, all stack traces of the group are appended
    // with the missing part.
    for i := 0; i < len(groups); i++ {
        g := groups[i]
        c := d[i]
        var off uint32
        for c.off > 0 {
            off += c.off
            n := d[c.gid]
            if n.off == 0 {
                // Stop early to preserve c.
                break
            }
            c = n
        }
        if off == 0 {
            // The current group has the most complete stack trace.
            continue
        }
        // The reference stack trace.
        appx := samples[groups[c.gid]].LocationId
        // It's possible that the reference stack trace does not
        // include the part we're looking for. In this case, we
        // simply ignore the group. Although it's possible to infer
        // this piece from other stacks, this is left for further
        // improvements.
        if int(off) >= len(appx) {
            continue
        }
        appx = appx[uint32(len(appx))-off:]
        // Now we append the missing part to all stack traces of the group.
        n := len(groups)
        if i+1 < len(groups) {
            n = groups[i+1]
        }
        for j := g; j < n; j++ {
            // Locations typically already have some extra capacity,
            // therefore no major allocations are expected here.
            samples[j].LocationId = append(samples[j].LocationId, appx...)
        }
    }
}

type group struct {
    gid uint32
    off uint32
}

// suffix returns the last suffixLen locations
// of the given stack trace represented as bytes.
// The returned slice is always suffixBytesLen long.
// The function panics if s is shorter than suffixLen.
func suffix(s []uint64) []byte {
    return locBytes(s[len(s)-suffixLen:])
}

// topToken returns the last tokenLen locations
// of the given stack trace represented as bytes.
// The returned slice is always tokenBytesLen long.
// The function panics if s is shorter than tokenLen.
func topToken(s []uint64) []byte {
    return locBytes(s[len(s)-tokenLen:])
}

func locBytes(s []uint64) []byte {
    size := len(s) * 8
    h := (*reflect.SliceHeader)(unsafe.Pointer(&s))
    h.Len = size
    h.Cap = size
    return *(*[]byte)(unsafe.Pointer(h))
}

func unsafeString(b []byte) string {
    return *(*string)(unsafe.Pointer(&b))
}
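
As a side note, not part of this change: on Go 1.20 and newer, the same conversions could be written with unsafe.Slice, unsafe.SliceData, and unsafe.String instead of the deprecated reflect.SliceHeader. A minimal sketch, with the hypothetical names locBytesAlt and unsafeStringAlt:

    // locBytesAlt reinterprets a []uint64 as its underlying bytes
    // (8 bytes per element) without copying.
    func locBytesAlt(s []uint64) []byte {
        return unsafe.Slice((*byte)(unsafe.Pointer(unsafe.SliceData(s))), len(s)*8)
    }

    // unsafeStringAlt views a byte slice as a string without copying.
    // The caller must not mutate b while the string is in use.
    func unsafeStringAlt(b []byte) string {
        return unsafe.String(unsafe.SliceData(b), len(b))
    }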

// split groups samples by stack trace suffix. The returned slice
// contains the index of the first sample of each group within the
// collection of selected samples.
func split(p *profilev1.Profile) ([]*profilev1.Sample, []int) {
    slices.SortFunc(p.Sample, func(a, b *profilev1.Sample) int {
        if len(a.LocationId) < suffixLen {
            return -1
        }
        if len(b.LocationId) < suffixLen {
            return 1
        }
        return bytes.Compare(
            suffix(a.LocationId),
            suffix(b.LocationId),
        )
    })
    o := sort.Search(len(p.Sample), func(i int) bool {
        return len(p.Sample[i].LocationId) >= suffixLen
    })
    if o == len(p.Sample) {
        return nil, nil
    }
    samples := p.Sample[o:]
    const avgGroupSize = 16 // Estimate.
    groups := make([]int, 0, len(samples)/avgGroupSize)
    var prev []byte
    for i := 0; i < len(samples); i++ {
        cur := suffix(samples[i].LocationId)
        if !bytes.Equal(cur, prev) {
            groups = append(groups, i)
            prev = cur
        }
    }
    return samples, groups
}
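
To illustrate how these pieces fit together, here is a minimal sketch of how a caller might repair a heap profile before further processing. The wrapper name fixGoHeap and the wiring are illustrative; MayHaveGoHeapTruncatedStacktraces, RepairGoHeapTruncatedStacktraces, and FixGoProfile (used in the test below) come from this package:

    // fixGoHeap normalizes a Go heap profile and, when the cheap pre-check
    // indicates possible truncation, attempts to repair the stack traces.
    func fixGoHeap(p *profilev1.Profile) *profilev1.Profile {
        p = FixGoProfile(p)
        if MayHaveGoHeapTruncatedStacktraces(p) {
            RepairGoHeapTruncatedStacktraces(p)
        }
        return p
    }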

@@ -0,0 +1,43 @@
package pprof

import (
    "os"
    "testing"

    "github.com/stretchr/testify/require"
)

func Benchmark_RepairGoTruncatedStacktraces(b *testing.B) {
    p, err := OpenFile("testdata/goheapfix/heap_go_truncated_3.pb.gz")
    require.NoError(b, err)
    b.ResetTimer()
    b.ReportAllocs()
    for i := 0; i < b.N; i++ {
        RepairGoHeapTruncatedStacktraces(FixGoProfile(p.CloneVT()))
    }
}

func Test_UpdateFixtures_RepairGoTruncatedStacktraces(t *testing.T) {
    t.Skip()
    t.Helper()
    paths := []string{
        "testdata/goheapfix/heap_go_truncated_1.pb.gz", // Cortex.
        "testdata/goheapfix/heap_go_truncated_2.pb.gz", // Cortex.
        "testdata/goheapfix/heap_go_truncated_3.pb.gz", // Loki. Pathological.
        "testdata/goheapfix/heap_go_truncated_4.pb.gz", // Pyroscope.
    }
    for _, path := range paths {
        func() {
            p, err := OpenFile(path)
            require.NoError(t, err, path)
            defer p.Close()
            f, err := os.Create(path + ".fixed")
            require.NoError(t, err, path)
            defer f.Close()
            p.Profile = FixGoProfile(p.Profile)
            RepairGoHeapTruncatedStacktraces(p.Profile)
            _, err = p.WriteTo(f)
            require.NoError(t, err, path)
        }()
    }
}