@@ -241,17 +241,18 @@ def __init__(
241
241
device = self .device )
242
242
243
243
# OPTIMIZATION: Cache the tensors rather than creating them every step.
244
-
245
- # For long context, we may need to store using int64 so max token idx doesn't overflow
246
- # token_indices is calculated by adding (req_idx * max_model_len) to per-request token indices
247
- # e.g. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
244
+ # For long context, may need to store int64 so max idx doesn't overflow
245
+ # token_indices calculated by adding (req_idx * max_model_len)
246
+ # to per-request indices e.g. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
248
247
# -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2]
249
248
# where M is the max_model_len.
250
- max_token_idx = self .max_num_tokens + self .max_num_reqs * self .max_model_len
249
+ max_token_idx = self .max_num_tokens + self .max_num_reqs * \
250
+ self .max_model_len
251
251
self .arange_np = np .arange (max (self .max_num_reqs + 1 ,
252
252
self .max_model_len ,
253
253
self .max_num_tokens ),
254
- dtype = np .int32 if max_token_idx <= np .iinfo (np .int32 ).max else np .int64 )
254
+ dtype = np .int32 if max_token_idx <= np .iinfo (
255
+ np .int32 ).max else np .int64 )
255
256
256
257
# NOTE(woosuk): These tensors are "stateless", i.e., they are literally
257
258
# a faster version of creating a new tensor every time. Thus, we should
0 commit comments