1 file changed: +5 −5 lines
```diff
@@ -35,7 +35,7 @@
 from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
 from vllm.transformers_utils.utils import check_gguf_file
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import FlexibleArgumentParser, is_in_ray_actor
+from vllm.utils import FlexibleArgumentParser, GiB_bytes, is_in_ray_actor
 
 # yapf: enable
 
@@ -1625,13 +1625,13 @@ def _set_default_args_v1(self, usage_context: UsageContext) -> None:
         # values for non-H100/H200 GPUs.
         try:
             from vllm.platforms import current_platform
-            device_name = current_platform.get_device_name().lower()
+            device_memory = current_platform.get_device_total_memory()
         except Exception:
             # This is only used to set default_max_num_batched_tokens
-            device_name = "no-device"
+            device_memory = 0
 
-        if "h100" in device_name or "h200" in device_name:
-            # For H100 and H200, we use larger default values.
+        if device_memory >= 70 * GiB_bytes:
+            # For GPUs like H100 and MI300x, use larger default values.
             default_max_num_batched_tokens = {
                 UsageContext.LLM_CLASS: 16384,
                 UsageContext.OPENAI_API_SERVER: 8192,
```
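For context, the hunk above replaces a device-name string match with a total-memory threshold, so large-memory GPUs from any vendor (e.g. AMD's MI300X) pick up the bigger defaults without being name-listed. Below is a minimal standalone sketch of that heuristic, assuming `GiB_bytes == 1 << 30` as in `vllm.utils`; the `UsageContext` stand-in, the `default_batched_tokens` helper, and the smaller-GPU defaults are placeholders for illustration, not values taken from this diff.

```python
from enum import Enum

GiB_bytes = 1 << 30  # bytes per GiB, matching vllm.utils.GiB_bytes


class UsageContext(Enum):
    # Simplified stand-in for vllm.usage.usage_lib.UsageContext
    LLM_CLASS = "LLM_CLASS"
    OPENAI_API_SERVER = "OPENAI_API_SERVER"


def default_batched_tokens(device_memory: int, usage_context: UsageContext) -> int:
    """Pick a default max_num_batched_tokens from total device memory (bytes).

    Any device with >= 70 GiB (H100, H200, MI300X, ...) gets the larger
    defaults; no device-name string matching is involved.
    """
    if device_memory >= 70 * GiB_bytes:
        defaults = {
            UsageContext.LLM_CLASS: 16384,
            UsageContext.OPENAI_API_SERVER: 8192,
        }
    else:
        # Hypothetical smaller defaults for this sketch; the real values
        # live further down in _set_default_args_v1 and are not shown here.
        defaults = {
            UsageContext.LLM_CLASS: 8192,
            UsageContext.OPENAI_API_SERVER: 2048,
        }
    return defaults[usage_context]


# An 80 GiB H100 and a 192 GiB MI300X both clear the 70 GiB bar:
assert default_batched_tokens(80 * GiB_bytes, UsageContext.LLM_CLASS) == 16384
assert default_batched_tokens(192 * GiB_bytes, UsageContext.LLM_CLASS) == 16384
# A 24 GiB consumer GPU, or the device_memory = 0 fallback, does not:
assert default_batched_tokens(24 * GiB_bytes, UsageContext.LLM_CLASS) == 8192
assert default_batched_tokens(0, UsageContext.OPENAI_API_SERVER) == 2048
```

Comparing against total memory rather than the device name also keeps the heuristic forward-compatible: any future 70 GiB+ accelerator qualifies automatically, and the `device_memory = 0` fallback routes no-device environments to the conservative defaults.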