@@ -171,7 +171,8 @@ def __init__(
171
171
172
172
# OPTIMIZATION: Cache the tensors rather than creating them every step.
173
173
self .arange_np = np .arange (max (self .max_num_reqs + 1 ,
174
- self .max_model_len ),
174
+ self .max_model_len ,
175
+ self .max_num_tokens ),
175
176
dtype = np .int32 )
176
177
# NOTE(woosuk): These tensors are "stateless", i.e., they are literally
177
178
# a faster version of creating a new tensor every time. Thus, we should
@@ -358,8 +359,15 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
358
359
359
360
# Get batched arange.
360
361
# E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
361
- arange = np .concatenate (
362
- [self .arange_np [:n ] for n in num_scheduled_tokens ])
362
+ # Equivalent to but faster than:
363
+ # np.concatenate([np.arange(n) for n in num_scheduled_tokens])
364
+ # Step 1. [2, 5, 3] -> [2, 7, 10]
365
+ cu_num_tokens = np .cumsum (num_scheduled_tokens )
366
# Step 2. [2, 7, 10] - [2, 5, 3] = [0, 2, 7]; repeat each offset [2, 5, 3] times -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7]
367
+ cumsums_offsets = np .repeat (cu_num_tokens - num_scheduled_tokens ,
368
+ num_scheduled_tokens )
369
# Step 3. [0, 1, 2, ..., 9] - [0, 0, 2, 2, 2, 2, 2, 7, 7, 7] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
370
+ arange = self .arange_np [:total_num_scheduled_tokens ] - cumsums_offsets
363
371
364
372
# Get positions.
365
373
positions_np = self .positions_np [:total_num_scheduled_tokens ]
@@ -406,8 +414,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
406
414
407
415
# Prepare the attention metadata.
408
416
self .query_start_loc_np [0 ] = 0
409
- np .cumsum (num_scheduled_tokens ,
410
- out = self .query_start_loc_np [1 :num_reqs + 1 ])
417
+ self .query_start_loc_np [1 :num_reqs + 1 ] = cu_num_tokens
411
418
412
419
self .seq_lens_np [:num_reqs ] = (
413
420
self .input_batch .num_computed_tokens_cpu [:num_reqs ] +
0 commit comments