
Commit 3c71780

mgoin and njhill authored and committed
[Bugfix] Enable V1 usage stats (vllm-project#16986)
Signed-off-by: mgoin <[email protected]>
Signed-off-by: Nick Hill <[email protected]>
Co-authored-by: Nick Hill <[email protected]>
Signed-off-by: Zijing Liu <[email protected]>
1 parent 5b40c90 commit 3c71780

File tree

5 files changed: 134 additions, 46 deletions


vllm/usage/usage_lib.py

Lines changed: 4 additions & 4 deletions

@@ -19,6 +19,7 @@
 
 import vllm.envs as envs
 from vllm.connections import global_http_connection
+from vllm.utils import cuda_device_count_stateless, cuda_get_device_properties
 from vllm.version import __version__ as VLLM_VERSION
 
 _config_home = envs.VLLM_CONFIG_ROOT
@@ -168,10 +169,9 @@ def _report_usage_once(self, model_architecture: str,
         # Platform information
         from vllm.platforms import current_platform
         if current_platform.is_cuda_alike():
-            device_property = torch.cuda.get_device_properties(0)
-            self.gpu_count = torch.cuda.device_count()
-            self.gpu_type = device_property.name
-            self.gpu_memory_per_device = device_property.total_memory
+            self.gpu_count = cuda_device_count_stateless()
+            self.gpu_type, self.gpu_memory_per_device = (
+                cuda_get_device_properties(0, ("name", "total_memory")))
         if current_platform.is_cuda():
             self.cuda_runtime = torch.version.cuda
         self.provider = _detect_cloud_provider()
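
Note on the change above: calling torch.cuda.get_device_properties(0) in the reporting process initializes a CUDA context there as a side effect, which the new helpers avoid. A minimal sketch of that side effect (assumes a CUDA-capable machine with PyTorch installed; not part of the diff):

import torch

# A fresh process has no CUDA context yet.
print(torch.cuda.is_initialized())       # False

# Querying device properties directly initializes CUDA as a side effect,
# which is what the patched reporting code now avoids in its own process.
_ = torch.cuda.get_device_properties(0)
print(torch.cuda.is_initialized())       # True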

vllm/utils.py

Lines changed: 19 additions & 1 deletion

@@ -38,11 +38,13 @@
 from collections import UserDict, defaultdict
 from collections.abc import (AsyncGenerator, Awaitable, Generator, Hashable,
                              Iterable, Iterator, KeysView, Mapping)
+from concurrent.futures.process import ProcessPoolExecutor
 from dataclasses import dataclass, field
 from functools import cache, lru_cache, partial, wraps
 from types import MappingProxyType
 from typing import (TYPE_CHECKING, Any, Callable, Generic, Literal, NamedTuple,
-                    Optional, Tuple, Type, TypeVar, Union, cast, overload)
+                    Optional, Sequence, Tuple, Type, TypeVar, Union, cast,
+                    overload)
 from uuid import uuid4
 
 import cachetools
@@ -1235,6 +1237,22 @@ def cuda_is_initialized() -> bool:
     return torch.cuda.is_initialized()
 
 
+def cuda_get_device_properties(device,
+                               names: Sequence[str],
+                               init_cuda=False) -> tuple[Any, ...]:
+    """Get specified CUDA device property values without initializing CUDA in
+    the current process."""
+    if init_cuda or cuda_is_initialized():
+        props = torch.cuda.get_device_properties(device)
+        return tuple(getattr(props, name) for name in names)
+
+    # Run in subprocess to avoid initializing CUDA as a side effect.
+    mp_ctx = multiprocessing.get_context("fork")
+    with ProcessPoolExecutor(max_workers=1, mp_context=mp_ctx) as executor:
+        return executor.submit(cuda_get_device_properties, device, names,
+                               True).result()
+
+
 def weak_bind(bound_method: Callable[..., Any], ) -> Callable[..., None]:
     """Make an instance method that weakly references
     its associated instance and no-ops once that
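
A minimal usage sketch for the new helper, mirroring the call site added in usage_lib.py (the device index and property names are taken from that call; the print is illustrative only):

from vllm.utils import cuda_get_device_properties

# The properties are read in a short-lived forked worker that passes
# init_cuda=True, so this process never initializes a CUDA context;
# only plain Python values come back.
name, total_memory = cuda_get_device_properties(0, ("name", "total_memory"))
print(f"GPU 0: {name}, {total_memory / 2**30:.1f} GiB")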

vllm/v1/engine/async_llm.py

Lines changed: 4 additions & 0 deletions

@@ -35,6 +35,7 @@
 from vllm.v1.metrics.loggers import (StatLoggerBase, StatLoggerFactory,
                                      setup_default_loggers)
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats
+from vllm.v1.utils import report_usage_stats
 
 logger = init_logger(__name__)
 
@@ -131,6 +132,9 @@ def __init__(
         except RuntimeError:
             pass
 
+        # If usage stat is enabled, collect relevant info.
+        report_usage_stats(vllm_config, usage_context)
+
     @classmethod
     def from_vllm_config(
         cls,

vllm/v1/engine/llm_engine.py

Lines changed: 63 additions & 41 deletions

@@ -4,9 +4,9 @@
 from copy import copy
 from typing import Any, Callable, Optional, Union
 
-from typing_extensions import TypeVar
-
 import vllm.envs as envs
+
+from typing_extensions import TypeVar
 from vllm.config import ParallelConfig, VllmConfig
 from vllm.distributed import stateless_destroy_torch_distributed_process_group
 from vllm.engine.arg_utils import EngineArgs
@@ -19,7 +19,9 @@
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer_group import (
-    BaseTokenizerGroup, init_tokenizer_from_configs)
+    BaseTokenizerGroup,
+    init_tokenizer_from_configs,
+)
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import Device
 from vllm.v1.engine.core_client import EngineCoreClient
@@ -28,6 +30,7 @@
 from vllm.v1.engine.processor import Processor
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.metrics.loggers import StatLoggerFactory
+from vllm.v1.utils import report_usage_stats
 
 logger = init_logger(__name__)
 
@@ -54,12 +57,14 @@ def __init__(
                 "Using V1 LLMEngine, but envs.VLLM_USE_V1=False. "
                 "This should not happen. As a workaround, try using "
                 "LLMEngine.from_vllm_config(...) or explicitly set "
-                "VLLM_USE_V1=0 or 1 and report this issue on Github.")
+                "VLLM_USE_V1=0 or 1 and report this issue on Github."
+            )
 
         if stat_loggers is not None:
             raise NotImplementedError(
                 "Passing StatLoggers to LLMEngine in V1 is not yet supported. "
-                "Set VLLM_USE_V1=0 and file and issue on Github.")
+                "Set VLLM_USE_V1=0 and file and issue on Github."
+            )
 
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
@@ -79,17 +84,17 @@ def __init__(
             model_config=vllm_config.model_config,
             scheduler_config=vllm_config.scheduler_config,
             parallel_config=vllm_config.parallel_config,
-            lora_config=vllm_config.lora_config)
+            lora_config=vllm_config.lora_config,
+        )
         self.tokenizer.ping()
 
         # Processor (convert Inputs --> EngineCoreRequests)
-        self.processor = Processor(vllm_config=vllm_config,
-                                   tokenizer=self.tokenizer,
-                                   mm_registry=mm_registry)
+        self.processor = Processor(
+            vllm_config=vllm_config, tokenizer=self.tokenizer, mm_registry=mm_registry
+        )
 
         # OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
-        self.output_processor = OutputProcessor(self.tokenizer,
-                                                log_stats=False)
+        self.output_processor = OutputProcessor(self.tokenizer, log_stats=False)
 
         # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs)
         self.engine_core = EngineCoreClient.make_client(
@@ -104,6 +109,9 @@ def __init__(
         # for v0 compatibility
         self.model_executor = self.engine_core.engine_core.model_executor  # type: ignore
 
+        # If usage stat is enabled, collect relevant info.
+        report_usage_stats(vllm_config, usage_context)
+
     @classmethod
     def from_vllm_config(
         cls,
@@ -112,12 +120,14 @@ def from_vllm_config(
         stat_loggers: Optional[list[StatLoggerFactory]] = None,
         disable_log_stats: bool = False,
     ) -> "LLMEngine":
-        return cls(vllm_config=vllm_config,
-                   executor_class=Executor.get_class(vllm_config),
-                   log_stats=(not disable_log_stats),
-                   usage_context=usage_context,
-                   stat_loggers=stat_loggers,
-                   multiprocess_mode=envs.VLLM_ENABLE_V1_MULTIPROCESSING)
+        return cls(
+            vllm_config=vllm_config,
+            executor_class=Executor.get_class(vllm_config),
+            log_stats=(not disable_log_stats),
+            usage_context=usage_context,
+            stat_loggers=stat_loggers,
+            multiprocess_mode=envs.VLLM_ENABLE_V1_MULTIPROCESSING,
+        )
 
     @classmethod
     def from_engine_args(
@@ -138,12 +148,14 @@ def from_engine_args(
             enable_multiprocessing = True
 
         # Create the LLMEngine.
-        return cls(vllm_config=vllm_config,
-                   executor_class=executor_class,
-                   log_stats=not engine_args.disable_log_stats,
-                   usage_context=usage_context,
-                   stat_loggers=stat_loggers,
-                   multiprocess_mode=enable_multiprocessing)
+        return cls(
+            vllm_config=vllm_config,
+            executor_class=executor_class,
+            log_stats=not engine_args.disable_log_stats,
+            usage_context=usage_context,
+            stat_loggers=stat_loggers,
+            multiprocess_mode=enable_multiprocessing,
+        )
 
     def get_num_unfinished_requests(self) -> int:
         return self.output_processor.get_num_unfinished_requests()
@@ -156,7 +168,8 @@ def has_unfinished_requests(self) -> bool:
 
     def has_unfinished_requests_dp(self, has_unfinished: bool) -> bool:
         aggregated_has_unfinished = ParallelConfig.has_unfinished_dp(
-            self.dp_group, has_unfinished)
+            self.dp_group, has_unfinished
+        )
         if not has_unfinished and aggregated_has_unfinished:
             self.should_execute_dummy_batch = True
         return aggregated_has_unfinished
@@ -183,11 +196,16 @@ def add_request(
         priority: int = 0,
     ) -> None:
         # Process raw inputs into the request.
-        request = self.processor.process_inputs(request_id, prompt, params,
-                                                arrival_time, lora_request,
-                                                trace_headers,
-                                                prompt_adapter_request,
-                                                priority)
+        request = self.processor.process_inputs(
+            request_id,
+            prompt,
+            params,
+            arrival_time,
+            lora_request,
+            trace_headers,
+            prompt_adapter_request,
+            priority,
+        )
 
         n = params.n if isinstance(params, SamplingParams) else 1
 
@@ -222,8 +240,7 @@ def step(self) -> list[RequestOutput]:
         outputs = self.engine_core.get_output()
 
         # 2) Process EngineCoreOutputs.
-        processed_outputs = self.output_processor.process_outputs(
-            outputs.outputs)
+        processed_outputs = self.output_processor.process_outputs(outputs.outputs)
 
         # 3) Abort any reqs that finished due to stop strings.
         self.engine_core.abort_requests(processed_outputs.reqs_to_abort)
@@ -261,12 +278,15 @@ def get_tokenizer_group(
         tokenizer_group = self.tokenizer
 
         if tokenizer_group is None:
-            raise ValueError("Unable to get tokenizer because "
-                             "skip_tokenizer_init is True")
+            raise ValueError(
+                "Unable to get tokenizer because " "skip_tokenizer_init is True"
+            )
         if not isinstance(tokenizer_group, group_type):
-            raise TypeError("Invalid type of tokenizer group. "
-                            f"Expected type: {group_type}, but "
-                            f"found type: {type(tokenizer_group)}")
+            raise TypeError(
+                "Invalid type of tokenizer group. "
+                f"Expected type: {group_type}, but "
+                f"found type: {type(tokenizer_group)}"
+            )
 
         return tokenizer_group
 
@@ -286,11 +306,13 @@ def pin_lora(self, lora_id: int) -> bool:
         """Prevent an adapter from being evicted."""
         return self.engine_core.pin_lora(lora_id)
 
-    def collective_rpc(self,
-                       method: Union[str, Callable[..., _R]],
-                       timeout: Optional[float] = None,
-                       args: tuple = (),
-                       kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
+    def collective_rpc(
+        self,
+        method: Union[str, Callable[..., _R]],
+        timeout: Optional[float] = None,
+        args: tuple = (),
+        kwargs: Optional[dict[str, Any]] = None,
+    ) -> list[_R]:
         return self.engine_core.collective_rpc(method, timeout, args, kwargs)
 
     def __del__(self):
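
Taken together with the async_llm.py change above, constructing a V1 engine now triggers a usage report at initialization time. A rough illustration of the effect (the model name is only an example; nothing is reported unless usage-stats collection is enabled):

from vllm import LLM

# LLM builds an LLMEngine under the hood; with V1 enabled, __init__ now calls
# report_usage_stats(vllm_config, usage_context) as part of construction.
llm = LLM(model="facebook/opt-125m")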

vllm/v1/utils.py

Lines changed: 44 additions & 0 deletions

@@ -12,6 +12,8 @@
 
 from vllm.logger import init_logger
 from vllm.model_executor.models.utils import extract_layer_index
+from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
+                                  usage_message)
 from vllm.utils import get_mp_context, kill_process_tree
 
 if TYPE_CHECKING:
@@ -201,3 +203,45 @@ def copy_slice(from_tensor: torch.Tensor, to_tensor: torch.Tensor,
     Returns the sliced target tensor.
     """
     return to_tensor[:length].copy_(from_tensor[:length], non_blocking=True)
+
+
+def report_usage_stats(vllm_config, usage_context: UsageContext) -> None:
+    """Report usage statistics if enabled."""
+
+    if not is_usage_stats_enabled():
+        return
+
+    from vllm.model_executor.model_loader import get_architecture_class_name
+
+    usage_message.report_usage(
+        get_architecture_class_name(vllm_config.model_config),
+        usage_context,
+        extra_kvs={
+            # Common configuration
+            "dtype":
+            str(vllm_config.model_config.dtype),
+            "tensor_parallel_size":
+            vllm_config.parallel_config.tensor_parallel_size,
+            "block_size":
+            vllm_config.cache_config.block_size,
+            "gpu_memory_utilization":
+            vllm_config.cache_config.gpu_memory_utilization,
+
+            # Quantization
+            "quantization":
+            vllm_config.model_config.quantization,
+            "kv_cache_dtype":
+            str(vllm_config.cache_config.cache_dtype),
+
+            # Feature flags
+            "enable_lora":
+            bool(vllm_config.lora_config),
+            "enable_prompt_adapter":
+            bool(vllm_config.prompt_adapter_config),
+            "enable_prefix_caching":
+            vllm_config.cache_config.enable_prefix_caching,
+            "enforce_eager":
+            vllm_config.model_config.enforce_eager,
+            "disable_custom_all_reduce":
+            vllm_config.parallel_config.disable_custom_all_reduce,
+        })
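
The report is gated entirely by is_usage_stats_enabled(). A small sketch of opting out before constructing an engine (the variable names below come from vLLM's usage-stats documentation, not from this diff, so treat them as an assumption):

import os

from vllm.usage.usage_lib import is_usage_stats_enabled

# Per vLLM's usage-stats docs, setting VLLM_NO_USAGE_STATS=1 or DO_NOT_TRACK=1
# (or creating ~/.config/vllm/do_not_track) opts out of collection.
os.environ["VLLM_NO_USAGE_STATS"] = "1"

# report_usage_stats() above returns early whenever this is False.
print(is_usage_stats_enabled())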
