Skip to content

Commit ff259d8

Browse files
committed
add request metrics
1 parent 918960c commit ff259d8

File tree

8 files changed

+161
-3
lines changed

8 files changed

+161
-3
lines changed

go.mod

+3
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ require (
2121
k8s.io/apimachinery v0.31.3
2222
k8s.io/client-go v0.31.3
2323
k8s.io/code-generator v0.31.3
24+
k8s.io/component-base v0.31.3
2425
k8s.io/klog/v2 v2.130.1
2526
sigs.k8s.io/controller-runtime v0.19.3
2627
sigs.k8s.io/structured-merge-diff/v4 v4.4.3
@@ -35,6 +36,7 @@ require (
3536
github.com/Masterminds/sprig/v3 v3.2.3 // indirect
3637
github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 // indirect
3738
github.com/beorn7/perks v1.0.1 // indirect
39+
github.com/blang/semver/v4 v4.0.0 // indirect
3840
github.com/bufbuild/protocompile v0.14.1 // indirect
3941
github.com/census-instrumentation/opencensus-proto v0.4.1 // indirect
4042
github.com/cespare/xxhash/v2 v2.3.0 // indirect
@@ -63,6 +65,7 @@ require (
6365
github.com/josharian/intern v1.0.0 // indirect
6466
github.com/json-iterator/go v1.1.12 // indirect
6567
github.com/klauspost/compress v1.17.9 // indirect
68+
github.com/kylelemons/godebug v1.1.0 // indirect
6669
github.com/mailru/easyjson v0.7.7 // indirect
6770
github.com/mitchellh/copystructure v1.0.0 // indirect
6871
github.com/mitchellh/reflectwalk v1.0.1 // indirect

go.sum

+4
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 h1:JYp7IbQjafo
1515
github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
1616
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
1717
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
18+
github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=
19+
github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ=
1820
github.com/bojand/ghz v0.120.0 h1:6F4wsmZVwFg5UnD+/R+IABWk6sKE/0OKIBdUQUZnOdo=
1921
github.com/bojand/ghz v0.120.0/go.mod h1:HfECuBZj1v02XObGnRuoZgyB1PR24/25dIYiJIMjJnE=
2022
github.com/bufbuild/protocompile v0.14.1 h1:iA73zAf/fyljNjQKwYzUHD6AD4R8KMasmwa/FBatYVw=
@@ -264,6 +266,8 @@ k8s.io/client-go v0.31.3 h1:CAlZuM+PH2cm+86LOBemaJI/lQ5linJ6UFxKX/SoG+4=
264266
k8s.io/client-go v0.31.3/go.mod h1:2CgjPUTpv3fE5dNygAr2NcM8nhHzXvxB8KL5gYc3kJs=
265267
k8s.io/code-generator v0.31.3 h1:Pj0fYOBms+ZrsulLi4DMsCEx1jG8fWKRLy44onHsLBI=
266268
k8s.io/code-generator v0.31.3/go.mod h1:/umCIlT84g1+Yu5ZXtP1KGSRTnGiIzzX5AzUAxsNlts=
269+
k8s.io/component-base v0.31.3 h1:DMCXXVx546Rfvhj+3cOm2EUxhS+EyztH423j+8sOwhQ=
270+
k8s.io/component-base v0.31.3/go.mod h1:xME6BHfUOafRgT0rGVBGl7TuSg8Z9/deT7qq6w7qjIU=
267271
k8s.io/gengo/v2 v2.0.0-20240228010128-51d4e06bde70 h1:NGrVE502P0s0/1hudf8zjgwki1X/TByhmAoILTarmzo=
268272
k8s.io/gengo/v2 v2.0.0-20240228010128-51d4e06bde70/go.mod h1:VH3AT8AaQOqiGjMF9p0/IM1Dj+82ZwjfxUP1IxaHE+8=
269273
k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=

pkg/ext-proc/metrics/metrics.go

+68
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
package metrics
2+
3+
import (
4+
"sync"
5+
"time"
6+
7+
compbasemetrics "k8s.io/component-base/metrics"
8+
"k8s.io/component-base/metrics/legacyregistry"
9+
)
10+
11+
const (
12+
// LLMServiceModelComponent is the subsystem name used by every metric
// declared in this package.
LLMServiceModelComponent = "llmservice_model"
13+
)
14+
15+
// Metric collectors for LLM service requests. All three use the
// LLMServiceModelComponent subsystem and the same three label names.
var (
16+
// requestCounter counts requests per
// (llmservice_name, model_name, target_model_name) label set.
requestCounter = compbasemetrics.NewCounterVec(
17+
&compbasemetrics.CounterOpts{
18+
Subsystem: LLMServiceModelComponent,
19+
Name: "request_total",
20+
Help: "Counter of LLM service requests broken out for each model and target model.",
21+
StabilityLevel: compbasemetrics.ALPHA,
22+
},
23+
[]string{"llmservice_name", "model_name", "target_model_name"},
24+
)
25+
26+
// requestLatencies is a histogram of request durations in seconds, with
// explicit bucket bounds from 0.005s (5ms) up to 3600s (one hour).
requestLatencies = compbasemetrics.NewHistogramVec(
27+
&compbasemetrics.HistogramOpts{
28+
Subsystem: LLMServiceModelComponent,
29+
Name: "request_duration_seconds",
30+
Help: "LLM service response latency distribution in seconds for each model and target model.",
31+
Buckets: []float64{0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3,
32+
4, 5, 6, 8, 10, 15, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600},
33+
StabilityLevel: compbasemetrics.ALPHA,
34+
},
35+
[]string{"llmservice_name", "model_name", "target_model_name"},
36+
)
37+
38+
// requestSizes is a histogram of request sizes in bytes.
requestSizes = compbasemetrics.NewHistogramVec(
39+
&compbasemetrics.HistogramOpts{
40+
Subsystem: LLMServiceModelComponent,
41+
Name: "request_sizes",
42+
Help: "LLM service requests size distribution in bytes for each model and target model.",
43+
// Use buckets ranging from 1000 bytes (1KB) to 10^9 bytes (1GB):
// seven bounds, each ten times the previous one.
44+
Buckets: compbasemetrics.ExponentialBuckets(1000, 10.0, 7),
45+
StabilityLevel: compbasemetrics.ALPHA,
46+
},
47+
[]string{"llmservice_name", "model_name", "target_model_name"},
48+
)
49+
)
50+
51+
var registerMetrics sync.Once
52+
53+
// Register all metrics.
54+
func Register() {
55+
registerMetrics.Do(func() {
56+
legacyregistry.MustRegister(requestCounter)
57+
legacyregistry.MustRegister(requestLatencies)
58+
legacyregistry.MustRegister(requestSizes)
59+
})
60+
}
61+
62+
// MonitorRequest handles monitoring requests.
63+
func MonitorRequest(llmserviceName, modelName, targetModelName string, reqSize int, elapsed time.Duration) {
64+
elapsedSeconds := elapsed.Seconds()
65+
requestCounter.WithLabelValues(llmserviceName, modelName, targetModelName).Inc()
66+
requestLatencies.WithLabelValues(llmserviceName, modelName, targetModelName).Observe(elapsedSeconds)
67+
requestSizes.WithLabelValues(llmserviceName, modelName, targetModelName).Observe(float64(reqSize))
68+
}

pkg/ext-proc/metrics/metrics_test.go

+78
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
package metrics
2+
3+
import (
4+
"os"
5+
"testing"
6+
"time"
7+
8+
"k8s.io/component-base/metrics/legacyregistry"
9+
"k8s.io/component-base/metrics/testutil"
10+
)
11+
12+
const RequestTotalMetric = LLMServiceModelComponent + "_request_total"
13+
14+
func TestMonitorRequest(t *testing.T) {
15+
type requests struct {
16+
llmserviceName string
17+
modelName string
18+
targetModelName string
19+
reqSize int
20+
elapsed time.Duration
21+
}
22+
scenarios := []struct {
23+
name string
24+
reqs []requests
25+
}{{
26+
name: "multiple requests",
27+
reqs: []requests{
28+
{
29+
llmserviceName: "s10",
30+
modelName: "m10",
31+
targetModelName: "t10",
32+
reqSize: 10,
33+
elapsed: time.Millisecond * 10,
34+
},
35+
{
36+
llmserviceName: "s10",
37+
modelName: "m10",
38+
targetModelName: "t10",
39+
reqSize: 20,
40+
elapsed: time.Millisecond * 20,
41+
},
42+
{
43+
llmserviceName: "s10",
44+
modelName: "m10",
45+
targetModelName: "t11",
46+
reqSize: 30,
47+
elapsed: time.Millisecond * 30,
48+
},
49+
{
50+
llmserviceName: "s20",
51+
modelName: "m20",
52+
targetModelName: "t20",
53+
reqSize: 40,
54+
elapsed: time.Millisecond * 40,
55+
},
56+
},
57+
}}
58+
Register()
59+
for _, scenario := range scenarios {
60+
t.Run(scenario.name, func(t *testing.T) {
61+
for _, req := range scenario.reqs {
62+
MonitorRequest(req.llmserviceName, req.modelName, req.targetModelName, req.reqSize, req.elapsed)
63+
}
64+
wantRequestTotal, err := os.Open("testdata/request_total_metric")
65+
defer func() {
66+
if err := wantRequestTotal.Close(); err != nil {
67+
t.Error(err)
68+
}
69+
}()
70+
if err != nil {
71+
t.Fatal(err)
72+
}
73+
if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRequestTotal, RequestTotalMetric); err != nil {
74+
t.Error(err)
75+
}
76+
})
77+
}
78+
}

pkg/ext-proc/metrics/testdata/request_duration_seconds_metic

Whitespace-only changes.

pkg/ext-proc/metrics/testdata/request_sizes_metric

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# HELP llmservice_model_request_total [ALPHA] Counter of LLM service requests broken out for each model and target model.
2+
# TYPE llmservice_model_request_total counter
3+
llmservice_model_request_total{llmservice_name="s10", model_name="m10", target_model_name="t10"} 2
4+
llmservice_model_request_total{llmservice_name="s10", model_name="m10", target_model_name="t11"} 1
5+
llmservice_model_request_total{llmservice_name="s20", model_name="m20", target_model_name="t20"} 1

pkg/ext-proc/scheduling/filter.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -157,9 +157,9 @@ func leastKVCacheFilterFunc(req *LLMRequest, pods []*backend.PodMetrics) ([]*bac
157157
type podPredicate func(req *LLMRequest, pod *backend.PodMetrics) bool
158158

159159
// We consider serving an adapter low cost if the adapter is active in the model server, or the
160-
// model server has room to load the adapter. The lowLoRACostPredicate ensures weak affinity by spreading the
161-
// load of a LoRA adapter across multiple pods, avoiding "pinning" all requests to a single pod.
162-
// This gave good performance in our initial benchmarking results in the scenario where # of lora slots > # of lora adapters.
160+
// model server has room to load the adapter. The lowLoRACostPredicate ensures weak affinity by spreading the
161+
// load of a LoRA adapter across multiple pods, avoiding "pinning" all requests to a single pod.
162+
// This gave good performance in our initial benchmarking results in the scenario where # of lora slots > # of lora adapters.
163163
func lowLoRACostPredicate(req *LLMRequest, pod *backend.PodMetrics) bool {
164164
_, ok := pod.ActiveModels[req.ResolvedTargetModel]
165165
return ok || len(pod.ActiveModels) < pod.MaxActiveModels

0 commit comments

Comments
 (0)