@@ -1279,11 +1279,22 @@ def _override_v1_engine_args(self, usage_context: UsageContext) -> None:
             self.enable_chunked_prefill = True
         # When no user override, set the default values based on the usage
         # context.
-        # TODO(woosuk): Tune the default values for different hardware.
-        default_max_num_batched_tokens = {
-            UsageContext.LLM_CLASS: 8192,
-            UsageContext.OPENAI_API_SERVER: 2048,
-        }
+        # Use different default values for different hardware.
+        from vllm.platforms import current_platform
+        device_name = current_platform.get_device_name().lower()
+        if "h100" in device_name or "h200" in device_name:
+            # For H100 and H200, we use larger default values.
+            default_max_num_batched_tokens = {
+                UsageContext.LLM_CLASS: 16384,
+                UsageContext.OPENAI_API_SERVER: 8192,
+            }
+        else:
+            # TODO(woosuk): Tune the default values for other hardware.
+            default_max_num_batched_tokens = {
+                UsageContext.LLM_CLASS: 8192,
+                UsageContext.OPENAI_API_SERVER: 2048,
+            }
+
         if (self.max_num_batched_tokens is None
                 and usage_context in default_max_num_batched_tokens):
             self.max_num_batched_tokens = default_max_num_batched_tokens[
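For reference, a minimal standalone sketch of the selection logic this diff introduces. `UsageContext` is stubbed here, and the device string stands in for what `current_platform.get_device_name()` returns in vLLM; the function name is hypothetical, not part of the codebase:

```python
# Standalone sketch of the hardware-aware defaults above.
# UsageContext is a stub for vLLM's enum of the same name, and the
# device_name argument stands in for current_platform.get_device_name().
from enum import Enum, auto


class UsageContext(Enum):
    LLM_CLASS = auto()
    OPENAI_API_SERVER = auto()


def pick_default_max_num_batched_tokens(device_name, usage_context):
    device_name = device_name.lower()
    if "h100" in device_name or "h200" in device_name:
        # Larger defaults on H100/H200, matching the diff.
        defaults = {
            UsageContext.LLM_CLASS: 16384,
            UsageContext.OPENAI_API_SERVER: 8192,
        }
    else:
        defaults = {
            UsageContext.LLM_CLASS: 8192,
            UsageContext.OPENAI_API_SERVER: 2048,
        }
    # Returns None when the usage context has no default, mirroring the
    # `usage_context in default_max_num_batched_tokens` guard above.
    return defaults.get(usage_context)


# e.g. an H100 serving the OpenAI API server gets 8192
print(pick_default_max_num_batched_tokens("NVIDIA H100 80GB HBM3",
                                          UsageContext.OPENAI_API_SERVER))
```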