We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 59d6bb4 · commit 42bb201 (full SHA: 42bb201)
vllm/v1/worker/gpu_input_batch.py
@@ -57,11 +57,13 @@ def __init__(

         # TODO(woosuk): This buffer could be too large if max_model_len is big.
         # Find a way to reduce the CPU memory usage.
+        # This buffer is not directly transferred to the GPU, so it does not
+        # need to be pinned.
         self.token_ids_cpu_tensor = torch.zeros(
             (max_num_reqs, max_model_len),
             device="cpu",
             dtype=torch.int32,
-            pin_memory=pin_memory,
+            pin_memory=False,
         )
         self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
         self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)
0 commit comments