
Commit 86757ac

[V1][Metrics] Add API for accessing in-memory Prometheus metrics
Signed-off-by: Mark McLoughlin <[email protected]>
1 parent 069c723 commit 86757ac

5 files changed: +238 -4 lines changed

examples/offline_inference/metrics.py

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from vllm import LLM, SamplingParams
+
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+
+def main():
+    # Create an LLM.
+    llm = LLM(model="facebook/opt-125m", disable_log_stats=False)
+
+    # Generate texts from the prompts.
+    outputs = llm.generate(prompts, sampling_params)
+
+    # Print the outputs.
+    print("-" * 50)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 50)
+
+    # Dump all metrics
+    metrics = llm.get_metrics()
+    for metric in metrics.list_metrics():
+        metric_type = metrics.get_type(metric)
+        if metric_type in ("counter", "gauge"):
+            print(f"{metric} ({metric_type}) = {metrics.get_value(metric)}")
+        else:
+            assert metric_type == "histogram"
+            print(f"{metric} ({metric_type})")
+            print(f"  sum = {metrics.get_histogram_sum(metric)}")
+            print(f"  count = {metrics.get_histogram_count(metric)}")
+            for bucket_le in metrics.get_histogram_buckets(metric):
+                print(
+                    f"    {bucket_le} = {metrics.get_value(metric, bucket_le)}"
+                )
+
+
+if __name__ == "__main__":
+    main()

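The snapshot API above also makes derived statistics straightforward. As a minimal sketch (not part of this commit), assuming the `llm` object from the example has finished generating: the mean generation length per request can be computed from a histogram's sum and count. The `mean_of_histogram` helper is hypothetical; the metric name comes from the test below.

def mean_of_histogram(snapshot, name: str) -> float:
    # Hypothetical helper: a histogram's mean is its sum of observed
    # values divided by its observation count.
    return (snapshot.get_histogram_sum(name) /
            snapshot.get_histogram_count(name))

snapshot = llm.get_metrics()
print(mean_of_histogram(snapshot, "vllm:request_generation_tokens"))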
tests/v1/engine/test_llm_engine.py

Lines changed: 45 additions & 0 deletions
@@ -97,3 +97,48 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None:
         raise AssertionError(
             f"{len(completion_counts)} unique completions; expected"
             f" {n}. Repeats: {repeats}")
+
+
+def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
+    max_tokens = 100
+    with vllm_runner(
+            MODEL,
+            disable_log_stats=False,
+    ) as vllm_model:
+        model: LLM = vllm_model.model
+        sampling_params = SamplingParams(temperature=0.0,
+                                         max_tokens=max_tokens)
+        outputs = model.generate(example_prompts, sampling_params)
+
+        n_prompts = len(example_prompts)
+        assert len(outputs) == n_prompts
+
+        total_tokens = 0
+        for out in outputs:
+            assert len(out.outputs) == 1
+            total_tokens += len(out.outputs[0].token_ids)
+        assert total_tokens == max_tokens * n_prompts
+
+        snapshot = model.get_metrics()
+        metrics = snapshot.list_metrics()
+
+        assert "vllm:num_requests_running" in metrics
+        assert snapshot.get_type("vllm:num_requests_running") == "gauge"
+        assert snapshot.get_value("vllm:num_requests_running") == 0
+
+        assert "vllm:generation_tokens" in metrics
+        assert snapshot.get_type("vllm:generation_tokens") == "counter"
+        assert snapshot.get_value("vllm:generation_tokens") == total_tokens
+
+        assert "vllm:request_generation_tokens" in metrics
+        assert snapshot.get_type(
+            "vllm:request_generation_tokens") == "histogram"
+        buckets = snapshot.get_histogram_buckets(
+            "vllm:request_generation_tokens")
+        assert buckets[-1] == "+Inf"
+        assert snapshot.get_value("vllm:request_generation_tokens",
+                                  buckets[-1]) == n_prompts
+        assert snapshot.get_histogram_count(
+            "vllm:request_generation_tokens") == n_prompts
+        assert snapshot.get_histogram_sum(
+            "vllm:request_generation_tokens") == total_tokens

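The final assertions above rely on a Prometheus convention worth spelling out: histogram buckets are cumulative, so the "+Inf" bucket always equals the total observation count (here, one observation per finished request). A standalone sketch using only prometheus_client, with demo metric names rather than vLLM's:

from prometheus_client import CollectorRegistry, Histogram

registry = CollectorRegistry()
hist = Histogram("demo_latency", "Cumulative bucket demo",
                 buckets=(10, 100), registry=registry)
for value in (5, 50, 500):
    hist.observe(value)

metric = next(m for m in registry.collect() if m.name == "demo_latency")
for sample in metric.samples:
    if sample.name == "demo_latency_bucket":
        # Cumulative counts: le="10.0" -> 1.0, le="100.0" -> 2.0,
        # le="+Inf" -> 3.0 (every observation lands in "+Inf").
        print(sample.labels["le"], "->", sample.value)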
vllm/entrypoints/llm.py

Lines changed: 15 additions & 0 deletions
@@ -44,6 +44,7 @@
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import (Counter, Device, deprecate_args, deprecate_kwargs,
                         is_list_of)
+from vllm.v1.metrics.reader import MetricsSnapshot
 
 logger = init_logger(__name__)
 
@@ -1269,6 +1270,20 @@ def wake_up(self, tags: Optional[list[str]] = None):
         """
         self.llm_engine.wake_up(tags)
 
+    def get_metrics(self) -> MetricsSnapshot:
+        """Return a snapshot of aggregated metrics from Prometheus.
+
+        Returns:
+            A ``MetricsSnapshot`` instance capturing the current state
+            of all aggregated metrics from Prometheus.
+
+        Note:
+            This method is only available with the V1 LLM engine.
+        """
+        from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
+        assert isinstance(self.llm_engine, V1LLMEngine)
+        return self.llm_engine.get_metrics()
+
     # LEGACY
     def _convert_v1_inputs(
         self,

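Because `get_metrics()` asserts that the V1 engine is in use (and, in the engine below, that stat logging is enabled), code that may run against either engine could guard the call. A minimal sketch, not part of this commit:

from vllm import LLM

llm = LLM(model="facebook/opt-125m", disable_log_stats=False)
llm.generate(["Hello, my name is"])

try:
    snapshot = llm.get_metrics()
except AssertionError:
    snapshot = None  # V0 engine (or stats disabled): API unavailable
if snapshot is not None:
    print(sorted(snapshot.list_metrics()))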
vllm/v1/engine/llm_engine.py

Lines changed: 24 additions & 4 deletions
@@ -10,7 +10,6 @@
 from vllm.config import ParallelConfig, VllmConfig
 from vllm.distributed import stateless_destroy_torch_distributed_process_group
 from vllm.engine.arg_utils import EngineArgs
-from vllm.engine.metrics_types import StatLoggerBase
 from vllm.inputs import PromptType
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
@@ -28,6 +27,9 @@
 from vllm.v1.engine.parallel_sampling import ParentRequest
 from vllm.v1.engine.processor import Processor
 from vllm.v1.executor.abstract import Executor
+from vllm.v1.metrics.loggers import PrometheusStatLogger, StatLoggerBase
+from vllm.v1.metrics.reader import MetricsSnapshot
+from vllm.v1.metrics.stats import IterationStats
 
 logger = init_logger(__name__)
 
@@ -60,6 +62,11 @@ def __init__(
         self.model_config = vllm_config.model_config
         self.cache_config = vllm_config.cache_config
 
+        self.log_stats = log_stats
+        self.stat_logger: Optional[StatLoggerBase] = None
+        if self.log_stats:
+            self.stat_logger = PrometheusStatLogger(vllm_config)
+
         # important: init dp group before init the engine_core
         # In the decoupled engine case this is handled in EngineCoreProc.
         parallel_config = vllm_config.parallel_config
@@ -84,15 +91,15 @@ def __init__(
 
         # OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
         self.output_processor = OutputProcessor(self.tokenizer,
-                                                log_stats=False)
+                                                log_stats=self.log_stats)
 
         # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs)
         self.engine_core = EngineCoreClient.make_client(
             multiprocess_mode=multiprocess_mode,
             asyncio_mode=False,
             vllm_config=vllm_config,
             executor_class=executor_class,
-            log_stats=False,  # FIXME: implement
+            log_stats=self.log_stats,
         )
 
         if not multiprocess_mode:
@@ -222,12 +229,21 @@ def step(self) -> list[RequestOutput]:
         outputs = self.engine_core.get_output()
 
         # 2) Process EngineCoreOutputs.
+        iteration_stats = IterationStats() if self.log_stats else None
         processed_outputs = self.output_processor.process_outputs(
-            outputs.outputs)
+            outputs.outputs,
+            engine_core_timestamp=outputs.timestamp,
+            iteration_stats=iteration_stats)
 
         # 3) Abort any reqs that finished due to stop strings.
         self.engine_core.abort_requests(processed_outputs.reqs_to_abort)
 
+        # 4) Record stats
+        if self.stat_logger is not None:
+            assert outputs.scheduler_stats is not None
+            self.stat_logger.record(scheduler_stats=outputs.scheduler_stats,
+                                    iteration_stats=iteration_stats)
+
         return processed_outputs.request_outputs
 
     def get_vllm_config(self):
@@ -254,6 +270,10 @@ def wake_up(self, tags: Optional[list[str]] = None):
     def is_sleeping(self) -> bool:
         return self.engine_core.is_sleeping()
 
+    def get_metrics(self) -> Optional[MetricsSnapshot]:
+        assert self.log_stats, "Stat logging disabled"
+        return MetricsSnapshot()
+
     def get_tokenizer_group(
         self,
         group_type: type[_G] = BaseTokenizerGroup,

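Note that `get_metrics()` returns a freshly constructed `MetricsSnapshot`, which (as `reader.py` below shows) copies registry state at construction time. Snapshots are therefore point-in-time values that can be diffed across calls; a sketch with a hypothetical `counter_delta` helper, assuming an `llm` with stats enabled:

def counter_delta(before, after, name: str) -> float:
    # Hypothetical helper: counters only increase, so the difference
    # between two snapshots is the amount accumulated in between.
    return after.get_value(name) - before.get_value(name)

before = llm.get_metrics()
llm.generate(["The capital of France is"])
after = llm.get_metrics()
print(counter_delta(before, after, "vllm:generation_tokens"))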
vllm/v1/metrics/reader.py

Lines changed: 105 additions & 0 deletions
@@ -0,0 +1,105 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from collections.abc import Iterable
+from typing import Optional
+
+from prometheus_client import REGISTRY, Metric
+from prometheus_client.samples import Sample
+
+
+class MetricsSnapshot:
+    """An API for accessing in-memory Prometheus metrics.
+
+    Example:
+        >>> metrics = llm.get_metrics()
+        >>> for metric in metrics.list_metrics():
+        ...     if metrics.get_type(metric) in ("counter", "gauge"):
+        ...         print(f"{metric} = {metrics.get_value(metric)}")
+        ...     else:
+        ...         print(f"{metric}")
+        ...         print(f"  sum = {metrics.get_histogram_sum(metric)}")
+        ...         print(f"  count = {metrics.get_histogram_count(metric)}")
+        ...         for bucket_le in metrics.get_histogram_buckets(metric):
+        ...             print(f"  {bucket_le} = "
+        ...                   f"{metrics.get_value(metric, bucket_le)}")
+    """
+
+    def __init__(self):
+        self._collected: dict[str, Metric] = {}
+        for metric in REGISTRY.collect():
+            self._collected[metric.name] = metric
+
+    def list_metrics(self) -> list[str]:
+        """Returns a list of all metric names currently available."""
+        return list(self._collected.keys())
+
+    def get_type(self, name: str) -> str:
+        """Return the type of a metric - gauge, counter, or histogram."""
+        return self._collected[name].type
+
+    def get_value(self, name: str, bucket_le: Optional[str] = None) -> float:
+        """Retrieves the value of a specific metric by its name.
+
+        Only supports gauge, counter, and histogram.
+
+        Raises an exception if the metric name is not found, or if it
+        is an unknown type.
+
+        Args:
+            name: The name of the metric.
+            bucket_le: The bucket label for histograms, if applicable.
+        """
+        metric = self._collected[name]
+        if metric.type == "gauge":
+            sample = self._must_get_sample(metric, name)
+        elif metric.type == "counter":
+            sample = self._must_get_sample(metric, name, "_total")
+        elif metric.type == "histogram":
+            assert bucket_le is not None
+            for sample in self._get_samples(metric, name, "_bucket"):
+                if sample.labels["le"] == bucket_le:
+                    break
+            else:
+                raise KeyError(f"No bucket {bucket_le} for {name}")
+        else:
+            raise AssertionError(f"Unknown metric type {metric.type}")
+        assert sample is not None
+        return sample.value
+
+    def get_histogram_buckets(self, name: str) -> list[str]:
+        """Returns the bucket labels for a histogram metric."""
+        histogram = self._must_get_histogram(name)
+        buckets = self._get_samples(histogram, name, "_bucket")
+        return [s.labels["le"] for s in buckets]
+
+    def get_histogram_count(self, name: str) -> float:
+        """Returns the count of samples for a histogram metric."""
+        histogram = self._must_get_histogram(name)
+        return self._must_get_sample(histogram, name, "_count").value
+
+    def get_histogram_sum(self, name: str) -> float:
+        """Returns the sum of samples for a histogram metric."""
+        histogram = self._must_get_histogram(name)
+        return self._must_get_sample(histogram, name, "_sum").value
+
+    #
+    # Helper methods
+    #
+    def _must_get_histogram(self, name: str) -> Metric:
+        metric = self._collected[name]
+        assert metric.type == "histogram"
+        return metric
+
+    @staticmethod
+    def _must_get_sample(metric: Metric,
+                         name: str,
+                         suffix: Optional[str] = None) -> Sample:
+        fullname = (name + suffix) if suffix is not None else name
+        return next(s for s in metric.samples if s.name == fullname)
+
+    @staticmethod
+    def _get_samples(metric: Metric,
+                     name: str,
+                     suffix: Optional[str] = None) -> Iterable[Sample]:
+        fullname = (name + suffix) if suffix is not None else name
+        return (s for s in metric.samples if s.name == fullname)

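The `_total`, `_bucket`, `_count`, and `_sum` suffixes that `get_value()` and the helpers append reflect how prometheus_client flattens each metric into multiple samples when collected. A standalone sketch with demo metric names, not vLLM code:

from prometheus_client import CollectorRegistry, Counter, Histogram

registry = CollectorRegistry()
Counter("demo_requests", "Demo counter", registry=registry).inc(3)
Histogram("demo_tokens", "Demo histogram", buckets=(1, 10),
          registry=registry).observe(4)

for metric in registry.collect():
    for sample in metric.samples:
        # The counter yields demo_requests_total (plus a _created
        # timestamp); the histogram yields demo_tokens_bucket{le=...},
        # demo_tokens_count, and demo_tokens_sum.
        print(metric.type, sample.name, dict(sample.labels), sample.value)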