1 file changed: +3 -6 lines changed
Original file line number | Diff line number | Diff line change
@@ -456,7 +456,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
456
456
@pytest .mark .parametrize (
457
457
"common_llm_kwargs" ,
458
458
[{
459
- "block_size" : 8 ,
459
+ "block_size" : 16 ,
460
460
# 2 for small prompt, 256//8 for generated.
461
461
"num_gpu_blocks_override" : 2 + 256 // 8 ,
462
462
"max_model_len" : (2 + 256 // 8 ) * 8 ,
@@ -526,11 +526,8 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
526
526
@pytest .mark .parametrize (
527
527
"per_test_common_llm_kwargs" ,
528
528
[
529
- # As of this writing, vLLM only compiles with these 3 block sizes by
530
- # default.
531
- {
532
- "block_size" : 8 ,
533
- },
529
+ # tl.dot doesn't support embedding < 16; see
530
+ # https://github.com/triton-lang/triton/issues/2266
534
531
{
535
532
"block_size" : 16 ,
536
533
},
You can’t perform that action at this time.
0 commit comments