Skip to content

Commit a264693

Browse files
authored
Updated fused_moe configs for MI325X with Triton 3.2 (#345)
1 parent ca4d670 commit a264693

16 files changed

+288
-288
lines changed

vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json

Lines changed: 19 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -1,16 +1,16 @@
11
{
22
"1": {
33
"BLOCK_SIZE_M": 16,
4-
"BLOCK_SIZE_N": 64,
4+
"BLOCK_SIZE_N": 16,
55
"BLOCK_SIZE_K": 256,
66
"GROUP_SIZE_M": 1,
7-
"num_warps": 4,
7+
"num_warps": 2,
88
"num_stages": 2,
99
"waves_per_eu": 0
1010
},
1111
"2": {
1212
"BLOCK_SIZE_M": 16,
13-
"BLOCK_SIZE_N": 64,
13+
"BLOCK_SIZE_N": 32,
1414
"BLOCK_SIZE_K": 256,
1515
"GROUP_SIZE_M": 1,
1616
"num_warps": 4,
@@ -19,10 +19,10 @@
1919
},
2020
"4": {
2121
"BLOCK_SIZE_M": 16,
22-
"BLOCK_SIZE_N": 32,
23-
"BLOCK_SIZE_K": 128,
22+
"BLOCK_SIZE_N": 16,
23+
"BLOCK_SIZE_K": 256,
2424
"GROUP_SIZE_M": 1,
25-
"num_warps": 2,
25+
"num_warps": 4,
2626
"num_stages": 2,
2727
"waves_per_eu": 0
2828
},
@@ -31,12 +31,12 @@
3131
"BLOCK_SIZE_N": 64,
3232
"BLOCK_SIZE_K": 256,
3333
"GROUP_SIZE_M": 1,
34-
"num_warps": 4,
34+
"num_warps": 2,
3535
"num_stages": 2,
3636
"waves_per_eu": 0
3737
},
3838
"16": {
39-
"BLOCK_SIZE_M": 16,
39+
"BLOCK_SIZE_M": 64,
4040
"BLOCK_SIZE_N": 64,
4141
"BLOCK_SIZE_K": 256,
4242
"GROUP_SIZE_M": 1,
@@ -45,11 +45,11 @@
4545
"waves_per_eu": 0
4646
},
4747
"24": {
48-
"BLOCK_SIZE_M": 16,
48+
"BLOCK_SIZE_M": 32,
4949
"BLOCK_SIZE_N": 64,
5050
"BLOCK_SIZE_K": 256,
5151
"GROUP_SIZE_M": 1,
52-
"num_warps": 4,
52+
"num_warps": 2,
5353
"num_stages": 2,
5454
"waves_per_eu": 0
5555
},
@@ -58,23 +58,23 @@
5858
"BLOCK_SIZE_N": 32,
5959
"BLOCK_SIZE_K": 256,
6060
"GROUP_SIZE_M": 4,
61-
"num_warps": 1,
61+
"num_warps": 2,
6262
"num_stages": 2,
6363
"waves_per_eu": 0
6464
},
6565
"48": {
66-
"BLOCK_SIZE_M": 32,
66+
"BLOCK_SIZE_M": 16,
6767
"BLOCK_SIZE_N": 64,
6868
"BLOCK_SIZE_K": 256,
6969
"GROUP_SIZE_M": 1,
70-
"num_warps": 2,
70+
"num_warps": 4,
7171
"num_stages": 2,
7272
"waves_per_eu": 0
7373
},
7474
"64": {
7575
"BLOCK_SIZE_M": 32,
76-
"BLOCK_SIZE_N": 64,
77-
"BLOCK_SIZE_K": 256,
76+
"BLOCK_SIZE_N": 16,
77+
"BLOCK_SIZE_K": 128,
7878
"GROUP_SIZE_M": 4,
7979
"num_warps": 2,
8080
"num_stages": 2,
@@ -127,8 +127,8 @@
127127
},
128128
"1536": {
129129
"BLOCK_SIZE_M": 256,
130-
"BLOCK_SIZE_N": 128,
131-
"BLOCK_SIZE_K": 128,
130+
"BLOCK_SIZE_N": 256,
131+
"BLOCK_SIZE_K": 64,
132132
"GROUP_SIZE_M": 1,
133133
"num_warps": 8,
134134
"num_stages": 2,
@@ -144,9 +144,9 @@
144144
"waves_per_eu": 0
145145
},
146146
"3072": {
147-
"BLOCK_SIZE_M": 256,
147+
"BLOCK_SIZE_M": 128,
148148
"BLOCK_SIZE_N": 256,
149-
"BLOCK_SIZE_K": 64,
149+
"BLOCK_SIZE_K": 128,
150150
"GROUP_SIZE_M": 1,
151151
"num_warps": 8,
152152
"num_stages": 2,

vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325_OAM.json

Lines changed: 17 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -34,10 +34,10 @@
3434
},
3535
"8": {
3636
"BLOCK_SIZE_M": 16,
37-
"BLOCK_SIZE_N": 16,
38-
"BLOCK_SIZE_K": 256,
37+
"BLOCK_SIZE_N": 32,
38+
"BLOCK_SIZE_K": 128,
3939
"GROUP_SIZE_M": 1,
40-
"num_warps": 1,
40+
"num_warps": 2,
4141
"num_stages": 2,
4242
"waves_per_eu": 0,
4343
"matrix_instr_nonkdim": 16,
@@ -56,25 +56,25 @@
5656
},
5757
"24": {
5858
"BLOCK_SIZE_M": 16,
59-
"BLOCK_SIZE_N": 16,
60-
"BLOCK_SIZE_K": 256,
59+
"BLOCK_SIZE_N": 128,
60+
"BLOCK_SIZE_K": 64,
6161
"GROUP_SIZE_M": 1,
62-
"num_warps": 1,
62+
"num_warps": 4,
6363
"num_stages": 2,
6464
"waves_per_eu": 0,
6565
"matrix_instr_nonkdim": 16,
66-
"kpack": 1
66+
"kpack": 2
6767
},
6868
"32": {
6969
"BLOCK_SIZE_M": 16,
7070
"BLOCK_SIZE_N": 64,
71-
"BLOCK_SIZE_K": 256,
71+
"BLOCK_SIZE_K": 128,
7272
"GROUP_SIZE_M": 4,
7373
"num_warps": 4,
7474
"num_stages": 2,
7575
"waves_per_eu": 0,
7676
"matrix_instr_nonkdim": 16,
77-
"kpack": 1
77+
"kpack": 2
7878
},
7979
"48": {
8080
"BLOCK_SIZE_M": 16,
@@ -96,7 +96,7 @@
9696
"num_stages": 2,
9797
"waves_per_eu": 0,
9898
"matrix_instr_nonkdim": 16,
99-
"kpack": 1
99+
"kpack": 2
100100
},
101101
"96": {
102102
"BLOCK_SIZE_M": 32,
@@ -117,13 +117,13 @@
117117
"num_warps": 4,
118118
"num_stages": 2,
119119
"waves_per_eu": 0,
120-
"matrix_instr_nonkdim": 16,
120+
"matrix_instr_nonkdim": 32,
121121
"kpack": 2
122122
},
123123
"256": {
124124
"BLOCK_SIZE_M": 128,
125125
"BLOCK_SIZE_N": 128,
126-
"BLOCK_SIZE_K": 128,
126+
"BLOCK_SIZE_K": 64,
127127
"GROUP_SIZE_M": 4,
128128
"num_warps": 8,
129129
"num_stages": 2,
@@ -132,10 +132,10 @@
132132
"kpack": 2
133133
},
134134
"512": {
135-
"BLOCK_SIZE_M": 256,
135+
"BLOCK_SIZE_M": 128,
136136
"BLOCK_SIZE_N": 128,
137137
"BLOCK_SIZE_K": 64,
138-
"GROUP_SIZE_M": 4,
138+
"GROUP_SIZE_M": 1,
139139
"num_warps": 8,
140140
"num_stages": 2,
141141
"waves_per_eu": 0,
@@ -144,7 +144,7 @@
144144
},
145145
"1024": {
146146
"BLOCK_SIZE_M": 128,
147-
"BLOCK_SIZE_N": 256,
147+
"BLOCK_SIZE_N": 128,
148148
"BLOCK_SIZE_K": 64,
149149
"GROUP_SIZE_M": 1,
150150
"num_warps": 8,
@@ -188,13 +188,13 @@
188188
},
189189
"4096": {
190190
"BLOCK_SIZE_M": 128,
191-
"BLOCK_SIZE_N": 256,
191+
"BLOCK_SIZE_N": 128,
192192
"BLOCK_SIZE_K": 64,
193193
"GROUP_SIZE_M": 1,
194194
"num_warps": 8,
195195
"num_stages": 2,
196196
"waves_per_eu": 0,
197197
"matrix_instr_nonkdim": 16,
198-
"kpack": 1
198+
"kpack": 2
199199
}
200200
}

vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8.json

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@
1919
},
2020
"4": {
2121
"BLOCK_SIZE_M": 16,
22-
"BLOCK_SIZE_N": 64,
22+
"BLOCK_SIZE_N": 32,
2323
"BLOCK_SIZE_K": 256,
2424
"GROUP_SIZE_M": 1,
2525
"num_warps": 2,
@@ -126,27 +126,27 @@
126126
"waves_per_eu": 0
127127
},
128128
"1536": {
129-
"BLOCK_SIZE_M": 128,
129+
"BLOCK_SIZE_M": 256,
130130
"BLOCK_SIZE_N": 128,
131131
"BLOCK_SIZE_K": 128,
132132
"GROUP_SIZE_M": 1,
133-
"num_warps": 4,
133+
"num_warps": 8,
134134
"num_stages": 2,
135135
"waves_per_eu": 0
136136
},
137137
"2048": {
138-
"BLOCK_SIZE_M": 256,
138+
"BLOCK_SIZE_M": 128,
139139
"BLOCK_SIZE_N": 256,
140-
"BLOCK_SIZE_K": 64,
140+
"BLOCK_SIZE_K": 128,
141141
"GROUP_SIZE_M": 1,
142142
"num_warps": 8,
143143
"num_stages": 2,
144144
"waves_per_eu": 0
145145
},
146146
"3072": {
147-
"BLOCK_SIZE_M": 256,
147+
"BLOCK_SIZE_M": 128,
148148
"BLOCK_SIZE_N": 256,
149-
"BLOCK_SIZE_K": 64,
149+
"BLOCK_SIZE_K": 128,
150150
"GROUP_SIZE_M": 1,
151151
"num_warps": 8,
152152
"num_stages": 2,

vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325_OAM.json

Lines changed: 13 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
"BLOCK_SIZE_N": 16,
55
"BLOCK_SIZE_K": 256,
66
"GROUP_SIZE_M": 1,
7-
"num_warps": 2,
7+
"num_warps": 4,
88
"num_stages": 2,
99
"waves_per_eu": 0,
1010
"matrix_instr_nonkdim": 16,
@@ -44,7 +44,7 @@
4444
"kpack": 2
4545
},
4646
"16": {
47-
"BLOCK_SIZE_M": 16,
47+
"BLOCK_SIZE_M": 32,
4848
"BLOCK_SIZE_N": 32,
4949
"BLOCK_SIZE_K": 256,
5050
"GROUP_SIZE_M": 1,
@@ -56,8 +56,8 @@
5656
},
5757
"24": {
5858
"BLOCK_SIZE_M": 16,
59-
"BLOCK_SIZE_N": 32,
60-
"BLOCK_SIZE_K": 256,
59+
"BLOCK_SIZE_N": 64,
60+
"BLOCK_SIZE_K": 128,
6161
"GROUP_SIZE_M": 1,
6262
"num_warps": 2,
6363
"num_stages": 2,
@@ -118,7 +118,7 @@
118118
"num_stages": 2,
119119
"waves_per_eu": 0,
120120
"matrix_instr_nonkdim": 16,
121-
"kpack": 2
121+
"kpack": 1
122122
},
123123
"256": {
124124
"BLOCK_SIZE_M": 128,
@@ -136,7 +136,7 @@
136136
"BLOCK_SIZE_N": 128,
137137
"BLOCK_SIZE_K": 64,
138138
"GROUP_SIZE_M": 1,
139-
"num_warps": 2,
139+
"num_warps": 8,
140140
"num_stages": 2,
141141
"waves_per_eu": 0,
142142
"matrix_instr_nonkdim": 16,
@@ -166,31 +166,31 @@
166166
},
167167
"2048": {
168168
"BLOCK_SIZE_M": 128,
169-
"BLOCK_SIZE_N": 256,
169+
"BLOCK_SIZE_N": 128,
170170
"BLOCK_SIZE_K": 64,
171-
"GROUP_SIZE_M": 1,
171+
"GROUP_SIZE_M": 4,
172172
"num_warps": 8,
173173
"num_stages": 2,
174174
"waves_per_eu": 0,
175175
"matrix_instr_nonkdim": 16,
176-
"kpack": 1
176+
"kpack": 2
177177
},
178178
"3072": {
179179
"BLOCK_SIZE_M": 128,
180180
"BLOCK_SIZE_N": 128,
181181
"BLOCK_SIZE_K": 64,
182-
"GROUP_SIZE_M": 8,
182+
"GROUP_SIZE_M": 16,
183183
"num_warps": 8,
184184
"num_stages": 2,
185185
"waves_per_eu": 0,
186186
"matrix_instr_nonkdim": 16,
187187
"kpack": 2
188188
},
189189
"4096": {
190-
"BLOCK_SIZE_M": 256,
191-
"BLOCK_SIZE_N": 256,
190+
"BLOCK_SIZE_M": 128,
191+
"BLOCK_SIZE_N": 128,
192192
"BLOCK_SIZE_K": 64,
193-
"GROUP_SIZE_M": 1,
193+
"GROUP_SIZE_M": 8,
194194
"num_warps": 8,
195195
"num_stages": 2,
196196
"waves_per_eu": 0,

0 commit comments

Comments
 (0)