Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix metrics debug log; change metrics client log level to reduce spam #478

Merged
merged 1 commit into from
Mar 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions pkg/epp/backend/metrics/fake.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ type FakePodMetrics struct {
Metrics *Metrics
}

// String renders the fake as "Pod: <pod>; Metrics: <metrics>", using the
// values returned by GetPod and GetMetrics formatted with %v.
func (fpm *FakePodMetrics) String() string {
	pod := fpm.GetPod()
	m := fpm.GetMetrics()
	return fmt.Sprintf("Pod: %v; Metrics: %v", pod, m)
}

// GetPod returns the Pod stored on the fake, exactly as held in its Pod field
// (no copy is made; the result may be nil if the field is unset).
func (fpm *FakePodMetrics) GetPod() *Pod {
return fpm.Pod
}
Expand Down
4 changes: 3 additions & 1 deletion pkg/epp/backend/metrics/logger.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package metrics

import (
"context"
"fmt"
"time"

"github.com/go-logr/logr"
Expand Down Expand Up @@ -76,7 +77,8 @@ func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometh
podsWithStaleMetrics := datastore.PodList(func(pm PodMetrics) bool {
return time.Since(pm.GetMetrics().UpdateTime) > metricsValidityPeriod
})
logger.Info("Current Pods and metrics gathered", "fresh metrics", podsWithFreshMetrics, "stale metrics", podsWithStaleMetrics)
s := fmt.Sprintf("Current Pods and metrics gathered. Fresh metrics: %+v, Stale metrics: %+v", podsWithFreshMetrics, podsWithStaleMetrics)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

logger.Info did not format the []PodMetrics values correctly: PodMetrics is an interface, so each element was always printed as an empty {}. I switched to building the message with fmt.Sprintf instead.

logger.Info(s)
}
}
}()
Expand Down
5 changes: 5 additions & 0 deletions pkg/epp/backend/metrics/pod_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package metrics

import (
"context"
"fmt"
"sync"
"sync/atomic"
"time"
Expand Down Expand Up @@ -52,6 +53,10 @@ type PodMetricsClient interface {
FetchMetrics(ctx context.Context, pod *Pod, existing *Metrics, port int32) (*Metrics, error)
}

// String renders this entry as "Pod: <pod>; Metrics: <metrics>", using the
// values returned by GetPod and GetMetrics formatted with %v.
func (pm *podMetrics) String() string {
	pod := pm.GetPod()
	m := pm.GetMetrics()
	return fmt.Sprintf("Pod: %v; Metrics: %v", pod, m)
}

// GetPod atomically loads the pointer stored in pm.pod and returns it
// converted to *Pod.
func (pm *podMetrics) GetPod() *Pod {
	raw := atomic.LoadPointer(&pm.pod)
	return (*Pod)(raw)
}
Expand Down
8 changes: 8 additions & 0 deletions pkg/epp/backend/metrics/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,21 @@ type PodMetrics interface {
GetMetrics() *Metrics
UpdatePod(*corev1.Pod)
StopRefreshLoop()
String() string
}

// Pod describes a pod by its namespaced name and its address.
type Pod struct {
// NamespacedName is the pod's namespace/name identifier.
NamespacedName types.NamespacedName
// Address is the pod's address as a string.
// NOTE(review): presumably the host/IP that the metrics client scrapes — confirm against callers.
Address string
}

// String formats the pointed-to Pod with the %+v verb (field names included).
// A nil receiver yields the empty string instead of panicking on dereference.
func (p *Pod) String() string {
	var out string
	if p != nil {
		out = fmt.Sprintf("%+v", *p)
	}
	return out
}

type Metrics struct {
// ActiveModels is a set of models(including LoRA adapters) that are currently cached to GPU.
ActiveModels map[string]int
Expand Down
13 changes: 6 additions & 7 deletions pkg/epp/backend/vllm/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,29 +61,28 @@ func (p *PodMetricsClientImpl) FetchMetrics(
existing *metrics.Metrics,
port int32,
) (*metrics.Metrics, error) {
logger := log.FromContext(ctx)
loggerDefault := logger.V(logutil.DEFAULT)
logger := log.FromContext(ctx).V(logutil.TRACE)

// Currently the metrics endpoint is hard-coded, which works with vLLM.
// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config.
url := "http://" + pod.Address + ":" + strconv.Itoa(int(port)) + "/metrics"

req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
loggerDefault.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url)
logger.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url)
return nil, fmt.Errorf("failed to create request: %v", err)
}
resp, err := http.DefaultClient.Do(req)
if err != nil {
loggerDefault.Error(err, "Failed to fetch metrics", "pod", pod.NamespacedName)
logger.Error(err, "Failed to fetch metrics", "pod", pod.NamespacedName)
return nil, fmt.Errorf("failed to fetch metrics from %s: %w", pod.NamespacedName, err)
}
defer func() {
_ = resp.Body.Close()
}()

if resp.StatusCode != http.StatusOK {
loggerDefault.Error(nil, "Unexpected status code returned", "pod", pod.NamespacedName, "statusCode", resp.StatusCode)
logger.Error(nil, "Unexpected status code returned", "pod", pod.NamespacedName, "statusCode", resp.StatusCode)
return nil, fmt.Errorf("unexpected status code from %s: %v", pod.NamespacedName, resp.StatusCode)
}

Expand Down Expand Up @@ -172,7 +171,7 @@ func promToPodMetrics(
func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, time.Time, error) {
loraRequests, ok := metricFamilies[LoraRequestInfoMetricName]
if !ok {
logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", LoraRequestInfoMetricName)
logger.V(logutil.TRACE).Error(nil, "Metric family not found", "name", LoraRequestInfoMetricName)
return nil, time.Time{}, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName)
}

Expand Down Expand Up @@ -219,7 +218,7 @@ func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.Metr
func getLatestMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.Metric, error) {
mf, ok := metricFamilies[metricName]
if !ok {
logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", metricName)
logger.V(logutil.TRACE).Error(nil, "Metric family not found", "name", metricName)
return nil, fmt.Errorf("metric family %q not found", metricName)
}
if len(mf.GetMetric()) == 0 {
Expand Down