Commit 103b4be

Add pressure stall information metrics

issues: #3052, #3083, kubernetes/enhancements#4205

This change adds metrics for pressure stall information (PSI), which indicate why some or all tasks of a cgroupv2 have waited due to resource congestion (cpu, memory, io). The change exposes this information by including the _PSIStats_ of each controller in its stats, i.e. _CPUStats.PSI_, _MemoryStats.PSI_ and _DiskStats.PSI_. The information is additionally exposed as Prometheus metrics. The metrics follow the naming outlined by prometheus/node-exporter, where stalled corresponds to full and waiting corresponds to some.

```
container_pressure_cpu_stalled_seconds_total
container_pressure_cpu_waiting_seconds_total
container_pressure_memory_stalled_seconds_total
container_pressure_memory_waiting_seconds_total
container_pressure_io_stalled_seconds_total
container_pressure_io_waiting_seconds_total
```

Signed-off-by: Felix Ehrenpfort <[email protected]>
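
For context, the kernel exposes PSI per cgroupv2 controller as a small text file (`cpu.pressure`, `memory.pressure`, `io.pressure`), each with a `some` line and a `full` line, where `total=` is cumulative stall time in microseconds. The following is a minimal, hypothetical Go sketch of parsing one such line; it is illustration only, not code from this commit:

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// PSILine mirrors one line of a cgroupv2 pressure file, e.g.
//   some avg10=0.12 avg60=0.05 avg300=0.01 total=123456
// where total is cumulative stall time in microseconds.
type PSILine struct {
	Kind                 string // "some" or "full"
	Avg10, Avg60, Avg300 float64
	Total                uint64 // microseconds
}

func parsePSILine(line string) (PSILine, error) {
	fields := strings.Fields(line)
	if len(fields) != 5 {
		return PSILine{}, fmt.Errorf("unexpected PSI line: %q", line)
	}
	p := PSILine{Kind: fields[0]}
	for _, kv := range fields[1:] {
		k, v, _ := strings.Cut(kv, "=")
		switch k {
		case "avg10":
			p.Avg10, _ = strconv.ParseFloat(v, 64)
		case "avg60":
			p.Avg60, _ = strconv.ParseFloat(v, 64)
		case "avg300":
			p.Avg300, _ = strconv.ParseFloat(v, 64)
		case "total":
			p.Total, _ = strconv.ParseUint(v, 10, 64)
		}
	}
	return p, nil
}

func main() {
	p, _ := parsePSILine("some avg10=0.12 avg60=0.05 avg300=0.01 total=123456")
	fmt.Printf("%+v\n", p) // {Kind:some Avg10:0.12 Avg60:0.05 Avg300:0.01 Total:123456}
}
```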
1 parent 6b23ac7 commit 103b4be

File tree

10 files changed: +152 −241 lines changed

Diff for: cmd/cadvisor_test.go (+1 −2)

```diff
@@ -112,8 +112,7 @@ func TestToIncludedMetrics(t *testing.T) {
 				container.ResctrlMetrics: struct{}{},
 				container.CPUSetMetrics: struct{}{},
 				container.OOMMetrics: struct{}{},
-				container.PSITotalMetrics: struct{}{},
-				container.PSIAvgMetrics: struct{}{},
+				container.PressureMetrics: struct{}{},
 			},
 			container.AllMetrics,
 			{},
```

Diff for: cmd/go.mod (−1)

```diff
@@ -130,4 +130,3 @@ require (
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 )
 
-replace github.com/opencontainers/runc => github.com/dqminh/runc v0.0.0-20220513155811-6414629ada8a
```

Diff for: container/factory.go (+2 −4)

```diff
@@ -66,8 +66,7 @@ const (
 	ResctrlMetrics MetricKind = "resctrl"
 	CPUSetMetrics MetricKind = "cpuset"
 	OOMMetrics MetricKind = "oom_event"
-	PSITotalMetrics MetricKind = "psi_total"
-	PSIAvgMetrics MetricKind = "psi_avg"
+	PressureMetrics MetricKind = "pressure"
 )
 
 // AllMetrics represents all kinds of metrics that cAdvisor supported.
@@ -93,8 +92,7 @@ var AllMetrics = MetricSet{
 	ResctrlMetrics: struct{}{},
 	CPUSetMetrics: struct{}{},
 	OOMMetrics: struct{}{},
-	PSITotalMetrics: struct{}{},
-	PSIAvgMetrics: struct{}{},
+	PressureMetrics: struct{}{},
 }
 
 // AllNetworkMetrics represents all network metrics that cAdvisor supports.
```
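
The `MetricKind`/`MetricSet` pair above is a plain set of enabled metric kinds; the Prometheus collector later gates registration on `includedMetrics.Has(container.PressureMetrics)`. A stand-in sketch of that pattern (a hypothetical simplification, not cAdvisor's exact code):

```go
package main

import "fmt"

type MetricKind string

const PressureMetrics MetricKind = "pressure"

// MetricSet is a set of enabled metric kinds, mirroring the pattern in factory.go.
type MetricSet map[MetricKind]struct{}

func (ms MetricSet) Has(mk MetricKind) bool {
	_, ok := ms[mk]
	return ok
}

func main() {
	included := MetricSet{PressureMetrics: struct{}{}}
	if included.Has(PressureMetrics) {
		fmt.Println("register container_pressure_* collectors")
	}
}
```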

Diff for: container/libcontainer/handler.go (+19 −20)

```diff
@@ -763,20 +763,6 @@ func (h *Handler) GetProcesses() ([]int, error) {
 	return pids, nil
 }
 
-// Convert libcontainer cgroups.PSIData to info.PSIData
-func convertPSIData(from *cgroups.PSIData, to *info.PSIData) {
-	to.Avg10 = from.Avg10
-	to.Avg60 = from.Avg60
-	to.Avg300 = from.Avg300
-	to.Total = from.Total
-}
-
-// Convert libcontainer cgroups.PSIStats to info.PSIStats
-func convertPSI(from *cgroups.PSIStats, to *info.PSIStats) {
-	convertPSIData(&from.Some, &to.Some)
-	convertPSIData(&from.Full, &to.Full)
-}
-
 // Convert libcontainer stats to info.ContainerStats.
 func setCPUStats(s *cgroups.Stats, ret *info.ContainerStats, withPerCPU bool) {
 	ret.Cpu.Usage.User = s.CpuStats.CpuUsage.UsageInUsermode
@@ -785,8 +771,7 @@ func setCPUStats(s *cgroups.Stats, ret *info.ContainerStats, withPerCPU bool) {
 	ret.Cpu.CFS.Periods = s.CpuStats.ThrottlingData.Periods
 	ret.Cpu.CFS.ThrottledPeriods = s.CpuStats.ThrottlingData.ThrottledPeriods
 	ret.Cpu.CFS.ThrottledTime = s.CpuStats.ThrottlingData.ThrottledTime
-
-	convertPSI(&s.CpuStats.PSI, &ret.Cpu.PSI)
+	setPSIStats(s.CpuStats.PSI, &ret.Cpu.PSI)
 
 	if !withPerCPU {
 		return
@@ -808,17 +793,15 @@ func setDiskIoStats(s *cgroups.Stats, ret *info.ContainerStats) {
 	ret.DiskIo.IoWaitTime = diskStatsCopy(s.BlkioStats.IoWaitTimeRecursive)
 	ret.DiskIo.IoMerged = diskStatsCopy(s.BlkioStats.IoMergedRecursive)
 	ret.DiskIo.IoTime = diskStatsCopy(s.BlkioStats.IoTimeRecursive)
-
-	convertPSI(&s.BlkioStats.PSI, &ret.DiskIo.PSI)
+	setPSIStats(s.BlkioStats.PSI, &ret.DiskIo.PSI)
 }
 
 func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) {
 	ret.Memory.Usage = s.MemoryStats.Usage.Usage
 	ret.Memory.MaxUsage = s.MemoryStats.Usage.MaxUsage
 	ret.Memory.Failcnt = s.MemoryStats.Usage.Failcnt
 	ret.Memory.KernelUsage = s.MemoryStats.KernelUsage.Usage
-
-	convertPSI(&s.MemoryStats.PSI, &ret.Memory.PSI)
+	setPSIStats(s.MemoryStats.PSI, &ret.Memory.PSI)
 
 	if cgroups.IsCgroup2UnifiedMode() {
 		ret.Memory.Cache = s.MemoryStats.Stats["file"]
@@ -904,6 +887,22 @@ func setHugepageStats(s *cgroups.Stats, ret *info.ContainerStats) {
 	}
 }
 
+func setPSIData(d *cgroups.PSIData, ret *info.PSIData) {
+	if d != nil {
+		ret.Total = d.Total
+		ret.Avg10 = d.Avg10
+		ret.Avg60 = d.Avg60
+		ret.Avg300 = d.Avg300
+	}
+}
+
+func setPSIStats(s *cgroups.PSIStats, ret *info.PSIStats) {
+	if s != nil {
+		setPSIData(&s.Full, &ret.Full)
+		setPSIData(&s.Some, &ret.Some)
+	}
+}
+
 // read from pids path not cpu
 func setThreadsStats(s *cgroups.Stats, ret *info.ContainerStats) {
 	if s != nil {
```
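
Worth noting why the new setters take a pointer and guard against nil: libcontainer now reports PSI as `*cgroups.PSIStats` (see the `PSI: &cgroups.PSIStats{...}` literal in the test diff below), which is nil when the kernel or cgroup version provides no PSI data, so `setPSIStats` leaves the destination zero-valued instead of dereferencing nil. A self-contained sketch of the pattern, using stand-in types rather than the real ones:

```go
package main

import "fmt"

// Stand-in types trimmed to the fields used here (hypothetical,
// mirroring cgroups.PSIStats / info.PSIStats from the diff above).
type PSIData struct{ Total uint64 }
type PSIStats struct{ Full, Some PSIData }

// setPSIStats mirrors the nil guard added in handler.go: a nil source
// (PSI unavailable, e.g. cgroup v1 or CONFIG_PSI=n) leaves ret zeroed.
func setPSIStats(s *PSIStats, ret *PSIStats) {
	if s != nil {
		ret.Full = s.Full
		ret.Some = s.Some
	}
}

func main() {
	var ret PSIStats
	setPSIStats(nil, &ret) // no-op instead of a nil-pointer panic
	fmt.Println(ret)       // {{0} {0}}

	setPSIStats(&PSIStats{Full: PSIData{Total: 100}}, &ret)
	fmt.Println(ret.Full.Total) // 100
}
```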

Diff for: container/libcontainer/handler_test.go (+15 −15)

```diff
@@ -110,17 +110,17 @@ func TestSetCPUStats(t *testing.T) {
 			UsageInKernelmode: 734746 * nanosecondsInSeconds / clockTicks,
 			UsageInUsermode: 2767637 * nanosecondsInSeconds / clockTicks,
 		},
-		PSI: cgroups.PSIStats{
-			Some: cgroups.PSIData{
-				Avg10: 0.1,
+		PSI: &cgroups.PSIStats{
+			Full: cgroups.PSIData{
+				Avg10: 0.3,
 				Avg60: 0.2,
-				Avg300: 0.3,
+				Avg300: 0.1,
 				Total: 100,
 			},
-			Full: cgroups.PSIData{
-				Avg10: 0.4,
-				Avg60: 0.5,
-				Avg300: 0.6,
+			Some: cgroups.PSIData{
+				Avg10: 0.6,
+				Avg60: 0.4,
+				Avg300: 0.2,
 				Total: 200,
 			},
 		},
@@ -138,16 +138,16 @@
 			Total: 33802947350272,
 		},
 		PSI: info.PSIStats{
-			Some: info.PSIData{
-				Avg10: 0.1,
+			Full: info.PSIData{
+				Avg10: 0.3,
 				Avg60: 0.2,
-				Avg300: 0.3,
+				Avg300: 0.1,
 				Total: 100,
 			},
-			Full: info.PSIData{
-				Avg10: 0.4,
-				Avg60: 0.5,
-				Avg300: 0.6,
+			Some: info.PSIData{
+				Avg10: 0.6,
+				Avg60: 0.4,
+				Avg300: 0.2,
 				Total: 200,
 			},
 		},
```

Diff for: info/v1/container.go (+20 −14)

```diff
@@ -261,16 +261,24 @@ func (ci *ContainerInfo) StatsEndTime() time.Time {
 	return ret
 }
 
-type PSIData struct {
-	Avg10 float64 `json:"avg10"`
-	Avg60 float64 `json:"avg60"`
-	Avg300 float64 `json:"avg300"`
-	Total uint64 `json:"total"`
-}
-
+// PSI statistics for an individual resource.
 type PSIStats struct {
-	Some PSIData `json:"some,omitempty"`
+	// PSI data for all tasks in the cgroup.
 	Full PSIData `json:"full,omitempty"`
+	// PSI data for some tasks in the cgroup.
+	Some PSIData `json:"some,omitempty"`
+}
+
+type PSIData struct {
+	// Total time duration tasks in the cgroup have waited due to congestion.
+	// Unit: microseconds.
+	Total uint64 `json:"total"`
+	// The average (in %) tasks have waited due to congestion over a 10 second window.
+	Avg10 float64 `json:"avg10"`
+	// The average (in %) tasks have waited due to congestion over a 60 second window.
+	Avg60 float64 `json:"avg60"`
+	// The average (in %) tasks have waited due to congestion over a 300 second window.
+	Avg300 float64 `json:"avg300"`
 }
 
 // This mirrors kernel internal structure.
@@ -346,9 +354,8 @@ type CpuStats struct {
 	// from LoadStats.NrRunning.
 	LoadAverage int32 `json:"load_average"`
 	// from LoadStats.NrUninterruptible
-	LoadDAverage int32 `json:"load_d_average"`
-
-	PSI PSIStats `json:"psi,omitempty"`
+	LoadDAverage int32 `json:"load_d_average"`
+	PSI PSIStats `json:"psi"`
 }
 
 type PerDiskStats struct {
@@ -367,8 +374,7 @@ type DiskIoStats struct {
 	IoWaitTime []PerDiskStats `json:"io_wait_time,omitempty"`
 	IoMerged []PerDiskStats `json:"io_merged,omitempty"`
 	IoTime []PerDiskStats `json:"io_time,omitempty"`
-
-	PSI PSIStats `json:"psi,omitempty"`
+	PSI PSIStats `json:"psi"`
 }
 
 type HugetlbStats struct {
@@ -428,7 +434,7 @@ type MemoryStats struct {
 	ContainerData MemoryStatsMemoryData `json:"container_data,omitempty"`
 	HierarchicalData MemoryStatsMemoryData `json:"hierarchical_data,omitempty"`
 
-	PSI PSIStats `json:"psi,omitempty"`
+	PSI PSIStats `json:"psi"`
}
 
 type CPUSetStats struct {
```
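
An aside on the `json:"psi"` tags above: in Go's encoding/json, `omitempty` never omits a non-pointer struct field (struct values have no defined emptiness), so the old `json:"psi,omitempty"` tags were effectively inert and the PSI object is serialized either way. A quick self-contained check, with simplified stand-in types:

```go
package main

import (
	"encoding/json"
	"fmt"
)

type PSIData struct {
	Total  uint64  `json:"total"`
	Avg10  float64 `json:"avg10"`
	Avg60  float64 `json:"avg60"`
	Avg300 float64 `json:"avg300"`
}

// Two variants of the same field: omitempty has no effect on struct values.
type withOmit struct {
	PSI PSIData `json:"psi,omitempty"`
}
type without struct {
	PSI PSIData `json:"psi"`
}

func main() {
	a, _ := json.Marshal(withOmit{})
	b, _ := json.Marshal(without{})
	fmt.Println(string(a)) // {"psi":{"total":0,"avg10":0,"avg60":0,"avg300":0}}
	fmt.Println(string(b)) // identical output
}
```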

Diff for: metrics/prometheus.go (+34 −64)

```diff
@@ -1746,64 +1746,54 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
 		})
 	}
 
-	if includedMetrics.Has(container.PSITotalMetrics) {
+	if includedMetrics.Has(container.PressureMetrics) {
 		c.containerMetrics = append(c.containerMetrics, []containerMetric{
 			{
-				name: "container_cpu_psi_total_seconds",
-				help: "Total time spent under cpu pressure in seconds.",
-				valueType: prometheus.CounterValue,
-				extraLabels: []string{"kind"},
+				name: "container_pressure_cpu_stalled_seconds_total",
+				help: "Total time duration no tasks in the container could make progress due to CPU congestion.",
+				valueType: prometheus.CounterValue,
 				getValues: func(s *info.ContainerStats) metricValues {
-					return getPSIValues(s, &s.Cpu.PSI, "total")
+					return metricValues{{value: float64(s.Cpu.PSI.Full.Total) / 1000.0 / 1000.0, timestamp: s.Timestamp}}
 				},
 			}, {
-				name: "container_memory_psi_total_seconds",
-				help: "Total container time spent under memory pressure in seconds.",
-				valueType: prometheus.CounterValue,
-				extraLabels: []string{"kind"},
+				name: "container_pressure_cpu_waiting_seconds_total",
+				help: "Total time duration tasks in the container have waited due to CPU congestion.",
+				valueType: prometheus.CounterValue,
 				getValues: func(s *info.ContainerStats) metricValues {
-					return getPSIValues(s, &s.Memory.PSI, "total")
+					return metricValues{{value: float64(s.Cpu.PSI.Some.Total) / 1000.0 / 1000.0, timestamp: s.Timestamp}}
 				},
 			}, {
-				name: "container_io_psi_total_seconds",
-				help: "Total time spent under io pressure in seconds.",
-				valueType: prometheus.CounterValue,
-				extraLabels: []string{"kind"},
+				name: "container_pressure_memory_stalled_seconds_total",
+				help: "Total time duration no tasks in the container could make progress due to memory congestion.",
+				valueType: prometheus.CounterValue,
+				getValues: func(s *info.ContainerStats) metricValues {
+					return metricValues{{value: float64(s.Memory.PSI.Full.Total) / 1000.0 / 1000.0, timestamp: s.Timestamp}}
+				},
+			}, {
+				name: "container_pressure_memory_waiting_seconds_total",
+				help: "Total time duration tasks in the container have waited due to memory congestion.",
+				valueType: prometheus.CounterValue,
+				getValues: func(s *info.ContainerStats) metricValues {
+					return metricValues{{value: float64(s.Memory.PSI.Some.Total) / 1000.0 / 1000.0, timestamp: s.Timestamp}}
+				},
+			}, {
+				name: "container_pressure_io_stalled_seconds_total",
+				help: "Total time duration no tasks in the container could make progress due to IO congestion.",
+				valueType: prometheus.CounterValue,
+				getValues: func(s *info.ContainerStats) metricValues {
+					return metricValues{{value: float64(s.DiskIo.PSI.Full.Total) / 1000.0 / 1000.0, timestamp: s.Timestamp}}
+				},
+			}, {
+				name: "container_pressure_io_waiting_seconds_total",
+				help: "Total time duration tasks in the container have waited due to IO congestion.",
+				valueType: prometheus.CounterValue,
 				getValues: func(s *info.ContainerStats) metricValues {
-					return getPSIValues(s, &s.DiskIo.PSI, "total")
+					return metricValues{{value: float64(s.DiskIo.PSI.Some.Total) / 1000.0 / 1000.0, timestamp: s.Timestamp}}
 				},
 			},
 		}...)
 	}
 
-	if includedMetrics.Has(container.PSIAvgMetrics) {
-		makePSIAvgMetric := func(controller, window string) containerMetric {
-			return containerMetric{
-				name: fmt.Sprintf("container_%s_psi_avg%s_ratio", controller, window),
-				help: fmt.Sprintf("Ratio of time spent under %s pressure over time window of %s seconds", controller, window),
-				valueType: prometheus.GaugeValue,
-				extraLabels: []string{"kind"},
-				getValues: func(s *info.ContainerStats) metricValues {
-					switch controller {
-					case "cpu":
-						return getPSIValues(s, &s.Cpu.PSI, "avg"+window)
-					case "memory":
-						return getPSIValues(s, &s.Memory.PSI, "avg"+window)
-					case "io":
-						return getPSIValues(s, &s.DiskIo.PSI, "avg"+window)
-					default:
-						return nil
-					}
-				},
-			}
-		}
-		for _, controller := range []string{"cpu", "memory", "io"} {
-			for _, window := range []string{"10", "60", "300"} {
-				c.containerMetrics = append(c.containerMetrics, makePSIAvgMetric(controller, window))
-			}
-		}
-	}
-
 	return c
 }
 
@@ -2096,23 +2086,3 @@ func getMinCoreScalingRatio(s *info.ContainerStats) metricValues {
 	}
 	return values
 }
-
-func getPSIValues(s *info.ContainerStats, psi *info.PSIStats, psiMetric string) metricValues {
-	v := make(metricValues, 0, 2)
-	switch psiMetric {
-	case "avg10":
-		v = append(v, metricValue{value: psi.Some.Avg10, timestamp: s.Timestamp, labels: []string{"some"}})
-		v = append(v, metricValue{value: psi.Full.Avg10, timestamp: s.Timestamp, labels: []string{"full"}})
-	case "avg60":
-		v = append(v, metricValue{value: psi.Some.Avg60, timestamp: s.Timestamp, labels: []string{"some"}})
-		v = append(v, metricValue{value: psi.Full.Avg60, timestamp: s.Timestamp, labels: []string{"full"}})
-	case "avg300":
-		v = append(v, metricValue{value: psi.Some.Avg300, timestamp: s.Timestamp, labels: []string{"some"}})
-		v = append(v, metricValue{value: psi.Full.Avg300, timestamp: s.Timestamp, labels: []string{"full"}})
-	case "total":
-		// total is measured as microseconds
-		v = append(v, metricValue{value: float64(time.Duration(psi.Some.Total)*time.Microsecond) / float64(time.Second), timestamp: s.Timestamp, labels: []string{"some"}})
-		v = append(v, metricValue{value: float64(time.Duration(psi.Full.Total)*time.Microsecond) / float64(time.Second), timestamp: s.Timestamp, labels: []string{"full"}})
-	}
-	return v
-}
```
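
One detail worth calling out: the kernel reports PSI `total` in microseconds (as the removed `getPSIValues` noted), so dividing by `1000.0` twice converts it to the seconds these `_seconds_total` counters advertise. A standalone sketch showing the equivalence with the old `time.Duration` arithmetic:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	var totalMicroseconds uint64 = 2500000 // 2.5s of cumulative stall time

	// New style in the diff: plain division, microseconds -> seconds.
	a := float64(totalMicroseconds) / 1000.0 / 1000.0

	// Old style from the removed getPSIValues: via time.Duration.
	b := float64(time.Duration(totalMicroseconds)*time.Microsecond) / float64(time.Second)

	fmt.Println(a, b) // 2.5 2.5
}
```

On the consuming side, these counters would typically be read with something like `rate(container_pressure_cpu_waiting_seconds_total[5m])`, which roughly yields the fraction of wall time tasks spent waiting, matching how node-exporter's PSI metrics are used.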
