Skip to content

Commit 44ff33a

Browse files
divakar-amd authored and Isotr0py committed
[ROCm][MoE] MI300 tuned configs Mixtral-8x(7B,22B) | fp16, fp8 (vllm-project#12408)
Signed-off-by: Divakar Verma <[email protected]>
Signed-off-by: Isotr0py <[email protected]>
1 parent fb0840d commit 44ff33a

16 files changed

+2260
-148
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
{
2+
"1": {
3+
"BLOCK_SIZE_M": 16,
4+
"BLOCK_SIZE_N": 64,
5+
"BLOCK_SIZE_K": 256,
6+
"GROUP_SIZE_M": 1,
7+
"num_warps": 4,
8+
"num_stages": 2,
9+
"waves_per_eu": 0
10+
},
11+
"2": {
12+
"BLOCK_SIZE_M": 16,
13+
"BLOCK_SIZE_N": 16,
14+
"BLOCK_SIZE_K": 256,
15+
"GROUP_SIZE_M": 1,
16+
"num_warps": 2,
17+
"num_stages": 2,
18+
"waves_per_eu": 0
19+
},
20+
"4": {
21+
"BLOCK_SIZE_M": 16,
22+
"BLOCK_SIZE_N": 64,
23+
"BLOCK_SIZE_K": 256,
24+
"GROUP_SIZE_M": 1,
25+
"num_warps": 4,
26+
"num_stages": 2,
27+
"waves_per_eu": 0
28+
},
29+
"8": {
30+
"BLOCK_SIZE_M": 16,
31+
"BLOCK_SIZE_N": 32,
32+
"BLOCK_SIZE_K": 256,
33+
"GROUP_SIZE_M": 1,
34+
"num_warps": 2,
35+
"num_stages": 2,
36+
"waves_per_eu": 0
37+
},
38+
"16": {
39+
"BLOCK_SIZE_M": 16,
40+
"BLOCK_SIZE_N": 64,
41+
"BLOCK_SIZE_K": 256,
42+
"GROUP_SIZE_M": 1,
43+
"num_warps": 2,
44+
"num_stages": 2,
45+
"waves_per_eu": 0
46+
},
47+
"24": {
48+
"BLOCK_SIZE_M": 16,
49+
"BLOCK_SIZE_N": 64,
50+
"BLOCK_SIZE_K": 256,
51+
"GROUP_SIZE_M": 1,
52+
"num_warps": 2,
53+
"num_stages": 2,
54+
"waves_per_eu": 0
55+
},
56+
"32": {
57+
"BLOCK_SIZE_M": 16,
58+
"BLOCK_SIZE_N": 32,
59+
"BLOCK_SIZE_K": 256,
60+
"GROUP_SIZE_M": 4,
61+
"num_warps": 2,
62+
"num_stages": 2,
63+
"waves_per_eu": 0
64+
},
65+
"48": {
66+
"BLOCK_SIZE_M": 16,
67+
"BLOCK_SIZE_N": 64,
68+
"BLOCK_SIZE_K": 128,
69+
"GROUP_SIZE_M": 4,
70+
"num_warps": 4,
71+
"num_stages": 2,
72+
"waves_per_eu": 0
73+
},
74+
"64": {
75+
"BLOCK_SIZE_M": 32,
76+
"BLOCK_SIZE_N": 64,
77+
"BLOCK_SIZE_K": 256,
78+
"GROUP_SIZE_M": 4,
79+
"num_warps": 2,
80+
"num_stages": 2,
81+
"waves_per_eu": 0
82+
},
83+
"96": {
84+
"BLOCK_SIZE_M": 32,
85+
"BLOCK_SIZE_N": 64,
86+
"BLOCK_SIZE_K": 256,
87+
"GROUP_SIZE_M": 1,
88+
"num_warps": 2,
89+
"num_stages": 2,
90+
"waves_per_eu": 0
91+
},
92+
"128": {
93+
"BLOCK_SIZE_M": 64,
94+
"BLOCK_SIZE_N": 128,
95+
"BLOCK_SIZE_K": 256,
96+
"GROUP_SIZE_M": 4,
97+
"num_warps": 8,
98+
"num_stages": 2,
99+
"waves_per_eu": 0
100+
},
101+
"256": {
102+
"BLOCK_SIZE_M": 128,
103+
"BLOCK_SIZE_N": 128,
104+
"BLOCK_SIZE_K": 256,
105+
"GROUP_SIZE_M": 4,
106+
"num_warps": 8,
107+
"num_stages": 2,
108+
"waves_per_eu": 0
109+
},
110+
"512": {
111+
"BLOCK_SIZE_M": 256,
112+
"BLOCK_SIZE_N": 128,
113+
"BLOCK_SIZE_K": 128,
114+
"GROUP_SIZE_M": 4,
115+
"num_warps": 8,
116+
"num_stages": 2,
117+
"waves_per_eu": 0
118+
},
119+
"1024": {
120+
"BLOCK_SIZE_M": 128,
121+
"BLOCK_SIZE_N": 128,
122+
"BLOCK_SIZE_K": 64,
123+
"GROUP_SIZE_M": 1,
124+
"num_warps": 4,
125+
"num_stages": 2,
126+
"waves_per_eu": 0
127+
},
128+
"1536": {
129+
"BLOCK_SIZE_M": 128,
130+
"BLOCK_SIZE_N": 256,
131+
"BLOCK_SIZE_K": 128,
132+
"GROUP_SIZE_M": 1,
133+
"num_warps": 8,
134+
"num_stages": 2,
135+
"waves_per_eu": 0
136+
},
137+
"2048": {
138+
"BLOCK_SIZE_M": 128,
139+
"BLOCK_SIZE_N": 256,
140+
"BLOCK_SIZE_K": 128,
141+
"GROUP_SIZE_M": 1,
142+
"num_warps": 8,
143+
"num_stages": 2,
144+
"waves_per_eu": 0
145+
},
146+
"3072": {
147+
"BLOCK_SIZE_M": 128,
148+
"BLOCK_SIZE_N": 256,
149+
"BLOCK_SIZE_K": 128,
150+
"GROUP_SIZE_M": 1,
151+
"num_warps": 8,
152+
"num_stages": 2,
153+
"waves_per_eu": 0
154+
},
155+
"4096": {
156+
"BLOCK_SIZE_M": 256,
157+
"BLOCK_SIZE_N": 256,
158+
"BLOCK_SIZE_K": 64,
159+
"GROUP_SIZE_M": 1,
160+
"num_warps": 8,
161+
"num_stages": 2,
162+
"waves_per_eu": 0
163+
}
164+
}

vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json

Lines changed: 32 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,43 +1,43 @@
11
{
22
"1": {
33
"BLOCK_SIZE_M": 16,
4-
"BLOCK_SIZE_N": 32,
4+
"BLOCK_SIZE_N": 16,
55
"BLOCK_SIZE_K": 256,
66
"GROUP_SIZE_M": 1,
77
"num_warps": 2,
88
"num_stages": 2,
99
"waves_per_eu": 0,
1010
"matrix_instr_nonkdim": 16,
11-
"kpack": 1
11+
"kpack": 2
1212
},
1313
"2": {
1414
"BLOCK_SIZE_M": 16,
1515
"BLOCK_SIZE_N": 16,
16-
"BLOCK_SIZE_K": 128,
16+
"BLOCK_SIZE_K": 256,
1717
"GROUP_SIZE_M": 1,
18-
"num_warps": 2,
18+
"num_warps": 4,
1919
"num_stages": 2,
2020
"waves_per_eu": 0,
2121
"matrix_instr_nonkdim": 16,
2222
"kpack": 2
2323
},
2424
"4": {
2525
"BLOCK_SIZE_M": 16,
26-
"BLOCK_SIZE_N": 32,
27-
"BLOCK_SIZE_K": 256,
26+
"BLOCK_SIZE_N": 16,
27+
"BLOCK_SIZE_K": 128,
2828
"GROUP_SIZE_M": 1,
29-
"num_warps": 2,
29+
"num_warps": 1,
3030
"num_stages": 2,
3131
"waves_per_eu": 0,
3232
"matrix_instr_nonkdim": 16,
3333
"kpack": 2
3434
},
3535
"8": {
3636
"BLOCK_SIZE_M": 16,
37-
"BLOCK_SIZE_N": 16,
38-
"BLOCK_SIZE_K": 256,
37+
"BLOCK_SIZE_N": 64,
38+
"BLOCK_SIZE_K": 64,
3939
"GROUP_SIZE_M": 1,
40-
"num_warps": 1,
40+
"num_warps": 2,
4141
"num_stages": 2,
4242
"waves_per_eu": 0,
4343
"matrix_instr_nonkdim": 16,
@@ -48,76 +48,76 @@
4848
"BLOCK_SIZE_N": 16,
4949
"BLOCK_SIZE_K": 256,
5050
"GROUP_SIZE_M": 1,
51-
"num_warps": 4,
51+
"num_warps": 2,
5252
"num_stages": 2,
5353
"waves_per_eu": 0,
5454
"matrix_instr_nonkdim": 16,
5555
"kpack": 2
5656
},
5757
"24": {
5858
"BLOCK_SIZE_M": 16,
59-
"BLOCK_SIZE_N": 32,
60-
"BLOCK_SIZE_K": 64,
59+
"BLOCK_SIZE_N": 16,
60+
"BLOCK_SIZE_K": 256,
6161
"GROUP_SIZE_M": 1,
62-
"num_warps": 1,
62+
"num_warps": 2,
6363
"num_stages": 2,
6464
"waves_per_eu": 0,
6565
"matrix_instr_nonkdim": 16,
6666
"kpack": 2
6767
},
6868
"32": {
6969
"BLOCK_SIZE_M": 16,
70-
"BLOCK_SIZE_N": 16,
71-
"BLOCK_SIZE_K": 128,
70+
"BLOCK_SIZE_N": 32,
71+
"BLOCK_SIZE_K": 256,
7272
"GROUP_SIZE_M": 4,
7373
"num_warps": 2,
7474
"num_stages": 2,
7575
"waves_per_eu": 0,
7676
"matrix_instr_nonkdim": 16,
77-
"kpack": 1
77+
"kpack": 2
7878
},
7979
"48": {
8080
"BLOCK_SIZE_M": 16,
81-
"BLOCK_SIZE_N": 16,
81+
"BLOCK_SIZE_N": 64,
8282
"BLOCK_SIZE_K": 128,
8383
"GROUP_SIZE_M": 4,
84-
"num_warps": 2,
84+
"num_warps": 4,
8585
"num_stages": 2,
8686
"waves_per_eu": 0,
8787
"matrix_instr_nonkdim": 16,
88-
"kpack": 2
88+
"kpack": 1
8989
},
9090
"64": {
9191
"BLOCK_SIZE_M": 32,
9292
"BLOCK_SIZE_N": 64,
9393
"BLOCK_SIZE_K": 128,
9494
"GROUP_SIZE_M": 4,
95-
"num_warps": 8,
95+
"num_warps": 4,
9696
"num_stages": 2,
9797
"waves_per_eu": 0,
9898
"matrix_instr_nonkdim": 16,
9999
"kpack": 2
100100
},
101101
"96": {
102102
"BLOCK_SIZE_M": 32,
103-
"BLOCK_SIZE_N": 32,
104-
"BLOCK_SIZE_K": 128,
103+
"BLOCK_SIZE_N": 64,
104+
"BLOCK_SIZE_K": 256,
105105
"GROUP_SIZE_M": 4,
106-
"num_warps": 4,
106+
"num_warps": 8,
107107
"num_stages": 2,
108108
"waves_per_eu": 0,
109109
"matrix_instr_nonkdim": 16,
110-
"kpack": 2
110+
"kpack": 1
111111
},
112112
"128": {
113113
"BLOCK_SIZE_M": 64,
114114
"BLOCK_SIZE_N": 64,
115-
"BLOCK_SIZE_K": 64,
115+
"BLOCK_SIZE_K": 128,
116116
"GROUP_SIZE_M": 4,
117-
"num_warps": 8,
117+
"num_warps": 4,
118118
"num_stages": 2,
119119
"waves_per_eu": 0,
120-
"matrix_instr_nonkdim": 16,
120+
"matrix_instr_nonkdim": 32,
121121
"kpack": 2
122122
},
123123
"256": {
@@ -129,7 +129,7 @@
129129
"num_stages": 2,
130130
"waves_per_eu": 0,
131131
"matrix_instr_nonkdim": 16,
132-
"kpack": 1
132+
"kpack": 2
133133
},
134134
"512": {
135135
"BLOCK_SIZE_M": 128,
@@ -150,7 +150,7 @@
150150
"num_warps": 8,
151151
"num_stages": 2,
152152
"waves_per_eu": 0,
153-
"matrix_instr_nonkdim": 32,
153+
"matrix_instr_nonkdim": 16,
154154
"kpack": 2
155155
},
156156
"1536": {
@@ -184,7 +184,7 @@
184184
"num_stages": 2,
185185
"waves_per_eu": 0,
186186
"matrix_instr_nonkdim": 16,
187-
"kpack": 1
187+
"kpack": 2
188188
},
189189
"4096": {
190190
"BLOCK_SIZE_M": 128,
@@ -195,6 +195,6 @@
195195
"num_stages": 2,
196196
"waves_per_eu": 0,
197197
"matrix_instr_nonkdim": 16,
198-
"kpack": 1
198+
"kpack": 2
199199
}
200200
}

0 commit comments

Comments (0)