[V1][Metrics] Account for multi-engine DP configuration

markmc · markmc · commit 9bdf0800933f · 2025-05-14T14:48:58.000-04:00
In the case of DP, we will have a complete set of metrics for
each DP rank.

We could make get_metrics_snapshot() take a DP rank parameter
to avoid returning one set per rank, but it is possible that in
future we will add further dimensions that we want to label on.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
diff --git a/vllm/v1/metrics/reader.py b/vllm/v1/metrics/reader.py
@@ -59,7 +59,7 @@ class Histogram(Metric):
     """
     count: int
     sum: float
-    buckets: dict[str, float]
+    buckets: dict[str, int]
 
 
 def get_metrics_snapshot() -> list[Metric]:
@@ -83,11 +83,10 @@ def get_metrics_snapshot() -> list[Metric]:
         if not metric.name.startswith("vllm:"):
             continue
         if metric.type == "gauge":
-            sample = _must_get_sample(metric)
-            collected.append(
-                Gauge(name=metric.name,
-                      labels=sample.labels,
-                      value=sample.value))
+            samples = _get_samples(metric)
+            for s in samples:
+                collected.append(
+                    Gauge(name=metric.name, labels=s.labels, value=s.value))
         elif metric.type == "counter":
             samples = _get_samples(metric, "_total")
             if metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos":
@@ -99,20 +98,16 @@ def get_metrics_snapshot() -> list[Metric]:
                 # accepted tokens using a Counter labeled with 'position'.
                 # We convert these into a vector of integer values.
                 #
-                values: list[int] = [0] * len(samples)
+                for labels, values in _digest_num_accepted_by_pos_samples(
+                        samples):
+                    collected.append(
+                        Vector(name=metric.name, labels=labels, values=values))
+            else:
                 for s in samples:
-                    values[int(s.labels["position"])] = int(s.value)
-                collected.append(
-                    Vector(name=metric.name,
-                           labels=_strip_label(s.labels, "position"),
-                           values=values))
-                continue
-
-            for s in samples:
-                collected.append(
-                    Counter(name=metric.name,
-                            labels=samples[0].labels,
-                            value=int(samples[0].value)))
+                    collected.append(
+                        Counter(name=metric.name,
+                                labels=s.labels,
+                                value=int(s.value)))
 
         elif metric.type == "histogram":
             #
@@ -122,17 +117,17 @@ def get_metrics_snapshot() -> list[Metric]:
             # indexed by the value of the 'le' label. The 'le=+Inf'
             # label is a special case, catching all values observed.
             #
-            count_sample = int(_must_get_sample(metric, "_count").value)
-            sum_sample = _must_get_sample(metric, "_sum").value
-            buckets: dict[str, float] = dict()
-            for s in _get_samples(metric, "_bucket"):
-                buckets[s.labels["le"]] = s.value
-            collected.append(
-                Histogram(name=metric.name,
-                          labels=_strip_label(s.labels, "le"),
-                          buckets=buckets,
-                          count=count_sample,
-                          sum=sum_sample))
+            bucket_samples = _get_samples(metric, "_bucket")
+            count_samples = _get_samples(metric, "_count")
+            sum_samples = _get_samples(metric, "_sum")
+            for labels, buckets, count_value, sum_value in _digest_histogram(
+                    bucket_samples, count_samples, sum_samples):
+                collected.append(
+                    Histogram(name=metric.name,
+                              labels=labels,
+                              buckets=buckets,
+                              count=count_value,
+                              sum=sum_value))
         else:
             raise AssertionError(f"Unknown metric type {metric.type}")
 
@@ -145,14 +140,106 @@ def _get_samples(metric: PromMetric,
     return [s for s in metric.samples if s.name == name]
 
 
-def _must_get_sample(metric: PromMetric,
-                     suffix: Optional[str] = None) -> Sample:
-    samples = _get_samples(metric, suffix)
-    assert len(samples) == 1
-    return samples[0]
-
-
 def _strip_label(labels: dict[str, str], key_to_remove: str) -> dict[str, str]:
     labels_copy = labels.copy()
     labels_copy.pop(key_to_remove)
     return labels_copy
+
+
+def _digest_histogram(
+    bucket_samples: list[Sample], count_samples: list[Sample],
+    sum_samples: list[Sample]
+) -> list[tuple[dict[str, str], dict[str, int], int, float]]:
+    # Group bucket/count/sum samples by their non-'le' label set.
+    # In the case of DP, we have an indigestible
+    # per-bucket-per-engine count as a list of labelled
+    # samples, along with count and sum samples
+    #
+    # bucket_samples (in):
+    #   labels = {le: 100, idx: 0}, value = 2
+    #   labels = {le: 200, idx: 0}, value = 4
+    #   labels = {le: +Inf, idx: 0}, value = 10
+    #   labels = {le: 100, idx: 1}, value = 1
+    #   labels = {le: 200, idx: 1}, value = 5
+    #   labels = {le: +Inf, idx: 1}, value = 7
+    # count_samples (in):
+    #   labels = {idx: 0}, value = 10
+    #   labels = {idx: 1}, value = 7
+    # sum_samples (in):
+    #   labels = {idx: 0}, value = 2000
+    #   labels = {idx: 1}, value = 1200
+    # Each label set must appear in all three inputs (asserted below).
+    # output: [
+    #   {idx: 0}, {"100": 2, "200": 4, "+Inf": 10}, 10, 2000
+    #   {idx: 1}, {"100": 1, "200": 5, "+Inf": 7},   7, 1200
+    # ]
+    buckets_by_labels: dict[frozenset[tuple[str, str]], dict[str, int]] = {}
+    for s in bucket_samples:
+        bucket = s.labels["le"]
+        labels_key = frozenset(_strip_label(s.labels, "le").items())
+        if labels_key not in buckets_by_labels:
+            buckets_by_labels[labels_key] = {}
+        buckets_by_labels[labels_key][bucket] = int(s.value)
+
+    counts_by_labels: dict[frozenset[tuple[str, str]], int] = {}
+    for s in count_samples:
+        labels_key = frozenset(s.labels.items())
+        counts_by_labels[labels_key] = int(s.value)
+
+    sums_by_labels: dict[frozenset[tuple[str, str]], float] = {}
+    for s in sum_samples:
+        labels_key = frozenset(s.labels.items())
+        sums_by_labels[labels_key] = s.value
+
+    assert set(buckets_by_labels.keys()) == set(
+        counts_by_labels.keys()) == set(sums_by_labels.keys())
+
+    output = []
+    label_keys = list(buckets_by_labels.keys())
+    for k in label_keys:
+        labels = dict(k)
+        output.append((labels, buckets_by_labels[k], counts_by_labels[k],
+                       sums_by_labels[k]))
+    return output
+
+
+def _digest_num_accepted_by_pos_samples(
+        samples: list[Sample]) -> list[tuple[dict[str, str], list[int]]]:
+    # Group per-position samples into one value vector per label set.
+    # In the case of DP, we have an indigestible
+    # per-position-per-engine count as a list of
+    # labelled samples
+    #
+    # samples (in):
+    #   labels = {position: 0, idx: 0}, value = 10
+    #   labels = {position: 1, idx: 0}, value = 7
+    #   labels = {position: 2, idx: 0}, value = 2
+    #   labels = {position: 0, idx: 1}, value = 5
+    #   labels = {position: 1, idx: 1}, value = 3
+    #   labels = {position: 2, idx: 1}, value = 1
+    #
+    # output: [
+    #   {idx: 0}, [10, 7, 2]
+    #   {idx: 1}, [5, 3, 1]
+    # ]
+    # All vectors share length max_pos + 1; unreported positions stay 0.
+    max_pos = 0
+    values_by_labels: dict[frozenset[tuple[str, str]], dict[int, int]] = {}
+
+    for s in samples:
+        position = int(s.labels["position"])
+        max_pos = max(max_pos, position)
+
+        labels_key = frozenset(_strip_label(s.labels, "position").items())
+        if labels_key not in values_by_labels:
+            values_by_labels[labels_key] = {}
+        values_by_labels[labels_key][position] = int(s.value)
+
+    output = []
+    for labels_key, values_by_position in values_by_labels.items():
+        labels = dict(labels_key)
+        values = [0] * (max_pos + 1)
+        for pos, val in values_by_position.items():
+            values[pos] = val
+        output.append((labels, values))
+    return output