|
| 1 | +package backend |
| 2 | + |
| 3 | +import ( |
| 4 | + "fmt" |
| 5 | + "strings" |
| 6 | + "sync" |
| 7 | + "time" |
| 8 | + |
| 9 | + dto "github.com/prometheus/client_model/go" |
| 10 | + "go.uber.org/multierr" |
| 11 | + klog "k8s.io/klog/v2" |
| 12 | +) |
| 13 | + |
// Names of the vLLM Prometheus metric families scraped by this package.
const (
	// ActiveLoRAAdaptersMetricName is an info-style metric whose
	// "active_adapters" label holds a comma-separated adapter list.
	ActiveLoRAAdaptersMetricName        = "vllm:info_active_adapters_info"
	LoRAAdapterPendingRequestMetricName = "vllm:active_lora_adapters"
	// TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork.
	RunningQueueSizeMetricName = "vllm:num_requests_running"
	WaitingQueueSizeMetricName = "vllm:num_requests_waiting"
	/* TODO: Uncomment this once the following are added to the fork.
	RunningQueueSizeMetricName = "vllm:num_tokens_running"
	WaitingQueueSizeMetricName = "vllm:num_tokens_waiting"
	*/
	// KVCacheUsagePercentMetricName reports GPU KV-cache utilization as a
	// fraction (gauge).
	KVCacheUsagePercentMetricName = "vllm:gpu_cache_usage_perc"
	// KvCacheMaxTokenCapacityMetricName is not emitted by vLLM yet; see the
	// commented-out block in promToPodMetrics.
	KvCacheMaxTokenCapacityMetricName = "vllm:gpu_cache_max_token_capacity"
)
| 27 | + |
| 28 | +func (p *Provider) refreshMetricsOnce() error { |
| 29 | + start := time.Now() |
| 30 | + defer func() { |
| 31 | + d := time.Now().Sub(start) |
| 32 | + // TODO: add a metric instead of logging |
| 33 | + klog.V(3).Infof("Refreshed metrics in %v", d) |
| 34 | + }() |
| 35 | + var wg sync.WaitGroup |
| 36 | + var errs error |
| 37 | + processOnePod := func(key, value any) bool { |
| 38 | + pod := key.(Pod) |
| 39 | + metrics := value.(*PodMetrics) |
| 40 | + wg.Add(1) |
| 41 | + go func() { |
| 42 | + defer wg.Done() |
| 43 | + metricFamilies, err := p.pmc.FetchMetrics(pod) |
| 44 | + if err != nil { |
| 45 | + multierr.Append(errs, fmt.Errorf("failed to parse metrics from %s: %v", pod, err)) |
| 46 | + return |
| 47 | + } |
| 48 | + updated, err := promToPodMetrics(metricFamilies, metrics) |
| 49 | + klog.V(3).Infof("Updated metrics for pod %s: %v", pod, updated.Metrics) |
| 50 | + if err != nil { |
| 51 | + multierr.Append(errs, fmt.Errorf("failed to get all pod metrics updated from prometheus: %v", err)) |
| 52 | + } |
| 53 | + p.UpdatePodMetrics(pod, updated) |
| 54 | + }() |
| 55 | + return true |
| 56 | + } |
| 57 | + p.podMetrics.Range(processOnePod) |
| 58 | + wg.Wait() |
| 59 | + return errs |
| 60 | +} |
| 61 | + |
| 62 | +// promToPodMetrics updates internal pod metrics with scraped prometheus metrics. |
| 63 | +// A combined error is returned if errors occur in one or more metric processing. |
| 64 | +// it returns a new PodMetrics pointer which can be used to atomically update the pod metrics map. |
| 65 | +func promToPodMetrics(metricFamilies map[string]*dto.MetricFamily, existing *PodMetrics) (*PodMetrics, error) { |
| 66 | + var errs error |
| 67 | + updated := existing.Clone() |
| 68 | + runningQueueSize, _, err := getLatestMetric(metricFamilies, RunningQueueSizeMetricName) |
| 69 | + multierr.Append(errs, err) |
| 70 | + if err != nil { |
| 71 | + updated.RunningQueueSize = int(runningQueueSize.GetCounter().GetValue()) |
| 72 | + } |
| 73 | + waitingQueueSize, _, err := getLatestMetric(metricFamilies, WaitingQueueSizeMetricName) |
| 74 | + multierr.Append(errs, err) |
| 75 | + if err != nil { |
| 76 | + updated.WaitingQueueSize = int(waitingQueueSize.GetGauge().GetValue()) |
| 77 | + } |
| 78 | + cachePercent, _, err := getLatestMetric(metricFamilies, KVCacheUsagePercentMetricName) |
| 79 | + multierr.Append(errs, err) |
| 80 | + if err != nil { |
| 81 | + updated.KVCacheUsagePercent = cachePercent.GetGauge().GetValue() |
| 82 | + } |
| 83 | + /* TODO: uncomment once this is available in vllm. |
| 84 | + kvCap, _, err := getGaugeLatestValue(metricFamilies, KvCacheMaxTokenCapacityMetricName) |
| 85 | + multierr.Append(errs, err) |
| 86 | + if err != nil { |
| 87 | + updated.KvCacheMaxTokenCapacity = int(kvCap) |
| 88 | + } |
| 89 | + */ |
| 90 | + |
| 91 | + // Update active loras |
| 92 | + mf, ok := metricFamilies[ActiveLoRAAdaptersMetricName] |
| 93 | + if ok { |
| 94 | + // IMPORTANT: replace the map entries instead of appending to it. |
| 95 | + updated.CachedModels = make(map[string]int) |
| 96 | + for _, metric := range mf.GetMetric() { |
| 97 | + for _, label := range metric.GetLabel() { |
| 98 | + if label.GetName() == "active_adapters" { |
| 99 | + if label.GetValue() != "" { |
| 100 | + adapterList := strings.Split(label.GetValue(), ",") |
| 101 | + for _, adapter := range adapterList { |
| 102 | + updated.CachedModels[adapter] = 0 |
| 103 | + } |
| 104 | + } |
| 105 | + } |
| 106 | + } |
| 107 | + } |
| 108 | + } else { |
| 109 | + klog.Warningf("metric family %q not found", ActiveLoRAAdaptersMetricName) |
| 110 | + multierr.Append(errs, fmt.Errorf("metric family %q not found", ActiveLoRAAdaptersMetricName)) |
| 111 | + } |
| 112 | + |
| 113 | + return updated, errs |
| 114 | +} |
| 115 | + |
| 116 | +// getLatestMetric gets the latest metric of a family. This should be used to get the latest Gauge metric. |
| 117 | +func getLatestMetric(metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.Metric, time.Time, error) { |
| 118 | + mf, ok := metricFamilies[metricName] |
| 119 | + if !ok { |
| 120 | + klog.Warningf("metric family %q not found", metricName) |
| 121 | + return nil, time.Time{}, fmt.Errorf("metric family %q not found", metricName) |
| 122 | + } |
| 123 | + if len(mf.GetMetric()) == 0 { |
| 124 | + return nil, time.Time{}, fmt.Errorf("no metrics available for %q", metricName) |
| 125 | + } |
| 126 | + var latestTs int64 |
| 127 | + var latest *dto.Metric |
| 128 | + for _, m := range mf.GetMetric() { |
| 129 | + if m.GetTimestampMs() > latestTs { |
| 130 | + latestTs = m.GetTimestampMs() |
| 131 | + latest = m |
| 132 | + } |
| 133 | + } |
| 134 | + return latest, time.Unix(0, latestTs*1000), nil |
| 135 | +} |
0 commit comments