Commit 28e0750

[V1] Avoid list creation in input preparation (#12457)
Signed-off-by: Woosuk Kwon <[email protected]>
1 parent 582cf78 commit 28e0750

File tree

1 file changed: +12 −5 lines

vllm/v1/worker/gpu_model_runner.py

Lines changed: 12 additions & 5 deletions
@@ -171,7 +171,8 @@ def __init__(
 
         # OPTIMIZATION: Cache the tensors rather than creating them every step.
         self.arange_np = np.arange(max(self.max_num_reqs + 1,
-                                       self.max_model_len),
+                                       self.max_model_len,
+                                       self.max_num_tokens),
                                    dtype=np.int32)
         # NOTE(woosuk): These tensors are "stateless", i.e., they are literally
         # a faster version of creating a new tensor every time. Thus, we should
@@ -358,8 +359,15 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
 
         # Get batched arange.
         # E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
-        arange = np.concatenate(
-            [self.arange_np[:n] for n in num_scheduled_tokens])
+        # Equivalent to but faster than:
+        # np.concatenate([np.arange(n) for n in num_scheduled_tokens])
+        # Step 1. [2, 5, 3] -> [2, 7, 10]
+        cu_num_tokens = np.cumsum(num_scheduled_tokens)
+        # Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7]
+        cumsums_offsets = np.repeat(cu_num_tokens - num_scheduled_tokens,
+                                    num_scheduled_tokens)
+        # Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
+        arange = self.arange_np[:total_num_scheduled_tokens] - cumsums_offsets
 
         # Get positions.
         positions_np = self.positions_np[:total_num_scheduled_tokens]
@@ -406,8 +414,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
 
         # Prepare the attention metadata.
         self.query_start_loc_np[0] = 0
-        np.cumsum(num_scheduled_tokens,
-                  out=self.query_start_loc_np[1:num_reqs + 1])
+        self.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens
 
         self.seq_lens_np[:num_reqs] = (
             self.input_batch.num_computed_tokens_cpu[:num_reqs] +
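
A minimal standalone sketch of the same trick, runnable outside vLLM. The names MAX_NUM_TOKENS, arange_np, and num_scheduled_tokens are illustrative stand-ins for the runner's cached self.arange_np and the per-step scheduler counts, not code taken from the repository; the assert checks that the vectorized path matches the list-building version the commit removes.

import numpy as np

# Cached once up front (the commit enlarges the real cache to
# max(max_num_reqs + 1, max_model_len, max_num_tokens)); the size here
# is just an illustrative upper bound.
MAX_NUM_TOKENS = 16
arange_np = np.arange(MAX_NUM_TOKENS, dtype=np.int32)

# Per-step scheduler output: tokens scheduled for each request.
num_scheduled_tokens = np.array([2, 5, 3], dtype=np.int32)
total_num_scheduled_tokens = int(num_scheduled_tokens.sum())

# Step 1. [2, 5, 3] -> [2, 7, 10]
cu_num_tokens = np.cumsum(num_scheduled_tokens)
# Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7]
cumsums_offsets = np.repeat(cu_num_tokens - num_scheduled_tokens,
                            num_scheduled_tokens)
# Step 3. Cached arange minus per-request offsets -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
arange = arange_np[:total_num_scheduled_tokens] - cumsums_offsets

# Same result as the list-based version, without building a Python list.
expected = np.concatenate([np.arange(n) for n in num_scheduled_tokens])
assert np.array_equal(arange, expected)

# cu_num_tokens doubles as the tail of query_start_loc: [0, 2, 7, 10]
query_start_loc = np.zeros(len(num_scheduled_tokens) + 1, dtype=np.int32)
query_start_loc[1:] = cu_num_tokens
print(arange, query_start_loc)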
