
Commit e4c7bf9

Merge branch 'vllm-project:main' into main
2 parents: 9e36cf1 + 24b7fb4

114 files changed: +7236 additions, -1093 deletions


.pre-commit-config.yaml

Lines changed: 3 additions & 0 deletions
@@ -1,3 +1,6 @@
+default_install_hook_types:
+- pre-commit
+- commit-msg
 default_stages:
 - pre-commit # Run locally
 - manual # Run in CI

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -242,6 +242,7 @@ set(VLLM_EXT_SRC
   "csrc/quantization/gguf/gguf_kernel.cu"
   "csrc/cuda_utils_kernels.cu"
   "csrc/prepare_inputs/advance_step.cu"
+  "csrc/custom_all_reduce.cu"
   "csrc/torch_bindings.cpp")
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")
@@ -283,7 +284,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     "csrc/mamba/causal_conv1d/causal_conv1d.cu"
     "csrc/quantization/aqlm/gemm_kernels.cu"
     "csrc/quantization/awq/gemm_kernels.cu"
-    "csrc/custom_all_reduce.cu"
     "csrc/permute_cols.cu"
     "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
     "csrc/quantization/fp4/nvfp4_quant_entry.cu"

README.md

Lines changed: 2 additions & 3 deletions
@@ -15,14 +15,12 @@ Easy, fast, and cheap LLM serving for everyone
 
 ---
 
-[2025/03] We are collaborating with Ollama to host an [Inference Night](https://lu.ma/vllm-ollama) at Y Combinator in San Francisco on Thursday, March 27, at 6 PM. Discuss all things inference local or data center!
-
 [2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)
 
 ---
 
 *Latest News* 🔥
-
+- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
 - [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
 - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
 - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
@@ -126,6 +124,7 @@ Compute Resources:
 - Databricks
 - DeepInfra
 - Google Cloud
+- Intel
 - Lambda Lab
 - Nebius
 - Novita AI

benchmarks/kernels/benchmark_moe.py

Lines changed: 61 additions & 36 deletions
@@ -30,19 +30,18 @@ class BenchmarkConfig(TypedDict):
     num_stages: int
 
 
-def benchmark_config(
-    config: BenchmarkConfig,
-    num_tokens: int,
-    num_experts: int,
-    shard_intermediate_size: int,
-    hidden_size: int,
-    topk: int,
-    dtype: torch.dtype,
-    use_fp8_w8a8: bool,
-    use_int8_w8a16: bool,
-    num_iters: int = 100,
-    block_quant_shape: List[int] = None,
-) -> float:
+def benchmark_config(config: BenchmarkConfig,
+                     num_tokens: int,
+                     num_experts: int,
+                     shard_intermediate_size: int,
+                     hidden_size: int,
+                     topk: int,
+                     dtype: torch.dtype,
+                     use_fp8_w8a8: bool,
+                     use_int8_w8a16: bool,
+                     num_iters: int = 100,
+                     block_quant_shape: List[int] = None,
+                     use_deep_gemm: bool = False) -> float:
     init_dtype = torch.float16 if use_fp8_w8a8 else dtype
     x = torch.randn(num_tokens, hidden_size, dtype=dtype)
     if use_int8_w8a16:
@@ -115,22 +114,41 @@ def prepare(i: int):
     def run():
         from vllm.model_executor.layers.fused_moe import override_config
         with override_config(config):
-            fused_moe(
-                x,
-                w1,
-                w2,
-                input_gating,
-                topk,
-                renormalize=True,
-                inplace=True,
-                use_fp8_w8a8=use_fp8_w8a8,
-                use_int8_w8a16=use_int8_w8a16,
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-                a1_scale=a1_scale,
-                a2_scale=a2_scale,
-                block_shape=block_quant_shape,
-            )
+            if use_deep_gemm:
+                topk_weights, topk_ids = fused_topk(x, input_gating, topk,
+                                                    False)
+                return fused_experts(
+                    x,
+                    w1,
+                    w2,
+                    topk_weights,
+                    topk_ids,
+                    inplace=True,
+                    use_fp8_w8a8=use_fp8_w8a8,
+                    w1_scale=w1_scale,
+                    w2_scale=w2_scale,
+                    a1_scale=a1_scale,
+                    a2_scale=a2_scale,
+                    block_shape=block_quant_shape,
+                    allow_deep_gemm=True,
+                )
+            else:
+                fused_moe(
+                    x,
+                    w1,
+                    w2,
+                    input_gating,
+                    topk,
+                    renormalize=True,
+                    inplace=True,
+                    use_fp8_w8a8=use_fp8_w8a8,
+                    use_int8_w8a16=use_int8_w8a16,
+                    w1_scale=w1_scale,
+                    w2_scale=w2_scale,
+                    a1_scale=a1_scale,
+                    a2_scale=a2_scale,
+                    block_shape=block_quant_shape,
+                )
 
     # JIT compilation & warmup
     run()
@@ -366,6 +384,7 @@ def benchmark(
         use_fp8_w8a8: bool,
         use_int8_w8a16: bool,
         block_quant_shape: List[int] = None,
+        use_deep_gemm: bool = False,
     ) -> tuple[dict[str, int], float]:
         current_platform.seed_everything(self.seed)
         dtype_str = get_config_dtype_str(dtype,
@@ -396,7 +415,8 @@ def benchmark(
                                        use_fp8_w8a8,
                                        use_int8_w8a16,
                                        num_iters=100,
-                                       block_quant_shape=block_quant_shape)
+                                       block_quant_shape=block_quant_shape,
+                                       use_deep_gemm=use_deep_gemm)
         return config, kernel_time
 
     def tune(
@@ -411,6 +431,7 @@ def tune(
         use_int8_w8a16: bool,
         search_space: list[dict[str, int]],
        block_quant_shape: list[int],
+        use_deep_gemm: bool,
     ) -> dict[str, int]:
         best_config = None
         best_time = float("inf")
@@ -436,7 +457,8 @@ def tune(
                                                use_fp8_w8a8,
                                                use_int8_w8a16,
                                                num_iters=20,
-                                               block_quant_shape=block_quant_shape)
+                                               block_quant_shape=block_quant_shape,
+                                               use_deep_gemm=use_deep_gemm)
             except triton.runtime.autotuner.OutOfResources:
                 # Some configurations may be invalid and fail to compile.
                 continue
@@ -550,6 +572,8 @@ def main(args: argparse.Namespace):
     else:
         batch_sizes = [args.batch_size]
 
+    use_deep_gemm = bool(args.use_deep_gemm)
+
     ray.init()
     num_gpus = int(ray.available_resources()["GPU"])
     workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
@@ -572,10 +596,10 @@ def _distribute(method: str, inputs: list[Any]) -> list[Any]:
 
     start = time.time()
     configs = _distribute(
-        "tune",
-        [(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
-          use_fp8_w8a8, use_int8_w8a16, search_space, block_quant_shape)
-         for batch_size in batch_sizes])
+        "tune", [(batch_size, E, shard_intermediate_size, hidden_size,
+                  topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space,
+                  block_quant_shape, use_deep_gemm)
+                 for batch_size in batch_sizes])
     best_configs = {
         M: sort_config(config)
         for M, config in zip(batch_sizes, configs)
@@ -589,7 +613,7 @@ def _distribute(method: str, inputs: list[Any]) -> list[Any]:
     outputs = _distribute(
         "benchmark",
         [(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
-          use_fp8_w8a8, use_int8_w8a16, block_quant_shape)
+          use_fp8_w8a8, use_int8_w8a16, block_quant_shape, use_deep_gemm)
          for batch_size in batch_sizes])
 
     for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
@@ -611,6 +635,7 @@ def _distribute(method: str, inputs: list[Any]) -> list[Any]:
                         type=str,
                         choices=["auto", "fp8_w8a8", "int8_w8a16"],
                         default="auto")
+    parser.add_argument("--use-deep-gemm", action="store_true")
     parser.add_argument("--seed", type=int, default=0)
     parser.add_argument("--batch-size", type=int, required=False)
     parser.add_argument("--tune", action="store_true")

collect_env.py

Lines changed: 18 additions & 6 deletions
@@ -482,16 +482,28 @@ def get_pip_packages(run_lambda, patterns=None):
     if patterns is None:
         patterns = DEFAULT_PIP_PATTERNS
 
-    # People generally have `pip` as `pip` or `pip3`
-    # But here it is invoked as `python -mpip`
-    def run_with_pip(pip):
-        out = run_and_read_all(run_lambda, pip + ["list", "--format=freeze"])
+    def run_with_pip():
+        try:
+            import importlib.util
+            pip_spec = importlib.util.find_spec('pip')
+            pip_available = pip_spec is not None
+        except ImportError:
+            pip_available = False
+
+        if pip_available:
+            cmd = [sys.executable, '-mpip', 'list', '--format=freeze']
+        elif os.environ.get("UV") is not None:
+            print("uv is set")
+            cmd = ["uv", "pip", "list", "--format=freeze"]
+        else:
+            raise RuntimeError("Could not collect pip list output (pip or uv module not available)")
+
+        out = run_and_read_all(run_lambda, cmd)
         return "\n".join(line for line in out.splitlines()
                          if any(name in line for name in patterns))
 
     pip_version = 'pip3' if sys.version[0] == '3' else 'pip'
-    out = run_with_pip([sys.executable, '-mpip'])
-
+    out = run_with_pip()
     return pip_version, out
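
Note on the collect_env.py change above: run_with_pip() now chooses the listing command itself, preferring `python -m pip list --format=freeze` and falling back to `uv pip list --format=freeze` only when pip is not importable and the UV environment variable is set. A standalone sketch of the same selection logic follows, using subprocess directly in place of the script's run_and_read_all helper.

# Standalone illustration of the selection logic above (sketch only).
import importlib.util
import os
import subprocess
import sys


def pip_list_command() -> list[str]:
    # Prefer pip invoked through the current interpreter.
    if importlib.util.find_spec("pip") is not None:
        return [sys.executable, "-mpip", "list", "--format=freeze"]
    # Fall back to uv when running under a uv-managed environment.
    if os.environ.get("UV") is not None:
        return ["uv", "pip", "list", "--format=freeze"]
    raise RuntimeError("Could not collect pip list output "
                       "(pip or uv module not available)")


if __name__ == "__main__":
    print(subprocess.check_output(pip_list_command(), text=True))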

csrc/custom_all_reduce.cu

Lines changed: 47 additions & 2 deletions
@@ -12,7 +12,7 @@ static_assert(sizeof(void*) == sizeof(fptr_t));
 
 fptr_t init_custom_ar(const std::vector<fptr_t>& fake_ipc_ptrs,
                       torch::Tensor& rank_data, int64_t rank,
-                      bool full_nvlink) {
+                      bool fully_connected) {
   int world_size = fake_ipc_ptrs.size();
   if (world_size > 8)
     throw std::invalid_argument("world size > 8 is not supported");
@@ -27,7 +27,7 @@ fptr_t init_custom_ar(const std::vector<fptr_t>& fake_ipc_ptrs,
   }
   return (fptr_t) new vllm::CustomAllreduce(ipc_ptrs, rank_data.data_ptr(),
                                             rank_data.numel(), rank, world_size,
-                                            full_nvlink);
+                                            fully_connected);
 }
 
 /**
@@ -142,3 +142,48 @@ void register_graph_buffers(fptr_t _fa,
   bytes.reserve(handles.size());
   fa->register_graph_buffers(bytes, offsets);
 }
+
+std::tuple<fptr_t, torch::Tensor> allocate_shared_buffer_and_handle(
+    int64_t size) {
+  auto device_index = c10::cuda::current_device();
+  at::DeviceGuard device_guard(at::Device(at::DeviceType::CUDA, device_index));
+  void* buffer;
+  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
+  auto stream = c10::cuda::getCurrentCUDAStream().stream();
+  AT_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode));
+
+  // Allocate buffer
+#if defined(USE_ROCM)
+  // data buffers need to be "uncached" for signal on MI200
+  AT_CUDA_CHECK(
+      hipExtMallocWithFlags((void**)&buffer, size, hipDeviceMallocUncached));
+#else
+  AT_CUDA_CHECK(cudaMalloc((void**)&buffer, size));
+#endif
+  AT_CUDA_CHECK(cudaMemsetAsync(buffer, 0, size, stream));
+  AT_CUDA_CHECK(cudaStreamSynchronize(stream));
+  AT_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode));
+
+  // Create IPC memhandle for the allocated buffer.
+  // Will use it in open_mem_handle.
+  auto options =
+      torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
+  auto handle =
+      torch::empty({static_cast<int64_t>(sizeof(cudaIpcMemHandle_t))}, options);
+  AT_CUDA_CHECK(
+      cudaIpcGetMemHandle((cudaIpcMemHandle_t*)handle.data_ptr(), buffer));
+
+  return std::make_tuple(reinterpret_cast<fptr_t>(buffer), handle);
+}
+
+fptr_t open_mem_handle(torch::Tensor& mem_handle) {
+  void* ipc_ptr;
+  AT_CUDA_CHECK(cudaIpcOpenMemHandle(
+      (void**)&ipc_ptr, *((const cudaIpcMemHandle_t*)mem_handle.data_ptr()),
+      cudaIpcMemLazyEnablePeerAccess));
+  return reinterpret_cast<fptr_t>(ipc_ptr);
+}
+
+void free_shared_buffer(fptr_t buffer) {
+  AT_CUDA_CHECK(cudaFree(reinterpret_cast<void*>(buffer)));
+}
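
Note on the custom_all_reduce.cu additions above: allocate_shared_buffer_and_handle() allocates a zeroed device buffer (cudaMalloc, or hipExtMallocWithFlags with the uncached flag on ROCm) and returns its pointer together with the raw cudaIpcMemHandle_t bytes packed into a CPU uint8 tensor; open_mem_handle() maps a handle exported by another process, and free_shared_buffer() releases the allocation. The full_nvlink argument is renamed to fully_connected, presumably because the file now also builds on ROCm (see the CMakeLists.txt change above). A hedged Python sketch of how these ops could be used across ranks follows; the torch.ops._C_custom_ar binding path and exact op names are assumptions, since the torch_bindings.cpp changes are not shown in this excerpt.

# Hypothetical usage sketch (binding namespace and op names assumed).
import torch
import torch.distributed as dist

ops = torch.ops._C_custom_ar  # assumed binding namespace


def exchange_ipc_buffers(size: int) -> list[int]:
    """Each rank allocates a buffer, all-gathers the IPC handles, and
    opens every peer's handle to obtain device pointers usable by the
    custom all-reduce."""
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    ptr, handle = ops.allocate_shared_buffer_and_handle(size)
    handles = [torch.empty_like(handle) for _ in range(world_size)]
    dist.all_gather(handles, handle)
    ptrs = []
    for peer, peer_handle in enumerate(handles):
        if peer == rank:
            ptrs.append(ptr)  # local allocation: use the pointer directly
        else:
            ptrs.append(ops.open_mem_handle(peer_handle))  # map peer's buffer
    return ptrs

Returning the handle as a CPU uint8 tensor (rather than an opaque object) is what makes this exchange straightforward: the bytes can be shipped through ordinary torch.distributed collectives or any other IPC mechanism before the pointers are handed to init_custom_ar.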
