Skip to content

Commit 07df631

Browse files
authored
Fix metrics debug log; change metrics client log level to reduce spam (#478)
1 parent 6b117df commit 07df631

File tree

5 files changed

+26
-8
lines changed

5 files changed

+26
-8
lines changed

pkg/epp/backend/metrics/fake.go

+4
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ type FakePodMetrics struct {
3434
Metrics *Metrics
3535
}
3636

37+
// String formats the fake as "Pod: <pod>; Metrics: <metrics>", using the
// values returned by GetPod and GetMetrics.
func (fpm *FakePodMetrics) String() string {
38+
return fmt.Sprintf("Pod: %v; Metrics: %v", fpm.GetPod(), fpm.GetMetrics())
39+
}
40+
3741
// GetPod returns the Pod field stored on the fake, unchanged.
func (fpm *FakePodMetrics) GetPod() *Pod {
3842
return fpm.Pod
3943
}

pkg/epp/backend/metrics/logger.go

+3-1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package metrics
1818

1919
import (
2020
"context"
21+
"fmt"
2122
"time"
2223

2324
"github.com/go-logr/logr"
@@ -76,7 +77,8 @@ func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometh
7677
podsWithStaleMetrics := datastore.PodList(func(pm PodMetrics) bool {
7778
return time.Since(pm.GetMetrics().UpdateTime) > metricsValidityPeriod
7879
})
79-
logger.Info("Current Pods and metrics gathered", "fresh metrics", podsWithFreshMetrics, "stale metrics", podsWithStaleMetrics)
80+
s := fmt.Sprintf("Current Pods and metrics gathered. Fresh metrics: %+v, Stale metrics: %+v", podsWithFreshMetrics, podsWithStaleMetrics)
81+
logger.Info(s)
8082
}
8183
}
8284
}()

pkg/epp/backend/metrics/pod_metrics.go

+5
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package metrics
1818

1919
import (
2020
"context"
21+
"fmt"
2122
"sync"
2223
"sync/atomic"
2324
"time"
@@ -52,6 +53,10 @@ type PodMetricsClient interface {
5253
FetchMetrics(ctx context.Context, pod *Pod, existing *Metrics, port int32) (*Metrics, error)
5354
}
5455

56+
// String formats pm as "Pod: <pod>; Metrics: <metrics>", reading both values
// through the GetPod and GetMetrics accessors.
func (pm *podMetrics) String() string {
57+
return fmt.Sprintf("Pod: %v; Metrics: %v", pm.GetPod(), pm.GetMetrics())
58+
}
59+
5560
// GetPod atomically loads the pointer held in pm.pod and returns it
// converted to *Pod.
func (pm *podMetrics) GetPod() *Pod {
5661
return (*Pod)(atomic.LoadPointer(&pm.pod))
5762
}

pkg/epp/backend/metrics/types.go

+8
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,21 @@ type PodMetrics interface {
6262
GetMetrics() *Metrics
6363
UpdatePod(*corev1.Pod)
6464
StopRefreshLoop()
65+
String() string
6566
}
6667

6768
// Pod pairs a namespaced name with an address string.
// NOTE(review): Address looks like the host part used when building the
// metrics endpoint URL in the vllm client — confirm against callers.
type Pod struct {
6869
NamespacedName types.NamespacedName
6970
Address string
7071
}
7172

73+
// String returns the "%+v" rendering (field names included) of the Pod value
// that p points to, or the empty string when p is nil.
func (p *Pod) String() string {
74+
// A nil receiver yields "" instead of dereferencing a nil pointer below.
if p == nil {
75+
return ""
76+
}
77+
// Format the dereferenced value rather than p: *Pod has this String method,
// so handing p to fmt would invoke String again.
return fmt.Sprintf("%+v", *p)
78+
}
79+
7280
type Metrics struct {
7381
// ActiveModels is a set of models(including LoRA adapters) that are currently cached to GPU.
7482
ActiveModels map[string]int

pkg/epp/backend/vllm/metrics.go

+6-7
Original file line numberDiff line numberDiff line change
@@ -61,29 +61,28 @@ func (p *PodMetricsClientImpl) FetchMetrics(
6161
existing *metrics.Metrics,
6262
port int32,
6363
) (*metrics.Metrics, error) {
64-
logger := log.FromContext(ctx)
65-
loggerDefault := logger.V(logutil.DEFAULT)
64+
logger := log.FromContext(ctx).V(logutil.TRACE)
6665

6766
// Currently the metrics endpoint is hard-coded, which works with vLLM.
6867
// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config.
6968
url := "http://" + pod.Address + ":" + strconv.Itoa(int(port)) + "/metrics"
7069

7170
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
7271
if err != nil {
73-
loggerDefault.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url)
72+
logger.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url)
7473
return nil, fmt.Errorf("failed to create request: %v", err)
7574
}
7675
resp, err := http.DefaultClient.Do(req)
7776
if err != nil {
78-
loggerDefault.Error(err, "Failed to fetch metrics", "pod", pod.NamespacedName)
77+
logger.Error(err, "Failed to fetch metrics", "pod", pod.NamespacedName)
7978
return nil, fmt.Errorf("failed to fetch metrics from %s: %w", pod.NamespacedName, err)
8079
}
8180
defer func() {
8281
_ = resp.Body.Close()
8382
}()
8483

8584
if resp.StatusCode != http.StatusOK {
86-
loggerDefault.Error(nil, "Unexpected status code returned", "pod", pod.NamespacedName, "statusCode", resp.StatusCode)
85+
logger.Error(nil, "Unexpected status code returned", "pod", pod.NamespacedName, "statusCode", resp.StatusCode)
8786
return nil, fmt.Errorf("unexpected status code from %s: %v", pod.NamespacedName, resp.StatusCode)
8887
}
8988

@@ -172,7 +171,7 @@ func promToPodMetrics(
172171
func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, time.Time, error) {
173172
loraRequests, ok := metricFamilies[LoraRequestInfoMetricName]
174173
if !ok {
175-
logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", LoraRequestInfoMetricName)
174+
logger.V(logutil.TRACE).Error(nil, "Metric family not found", "name", LoraRequestInfoMetricName)
176175
return nil, time.Time{}, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName)
177176
}
178177

@@ -219,7 +218,7 @@ func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.Metr
219218
func getLatestMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.Metric, error) {
220219
mf, ok := metricFamilies[metricName]
221220
if !ok {
222-
logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", metricName)
221+
logger.V(logutil.TRACE).Error(nil, "Metric family not found", "name", metricName)
223222
return nil, fmt.Errorf("metric family %q not found", metricName)
224223
}
225224
if len(mf.GetMetric()) == 0 {

0 commit comments

Comments
 (0)