@@ -20,8 +20,6 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
20
20
size_t size = ggml_nelements (tensor);
21
21
std::vector<float > data (size);
22
22
23
- std::random_device rd;
24
-
25
23
#if 0
26
24
std::default_random_engine generator(rd());
27
25
std::uniform_real_distribution<float> distribution(min, max);
@@ -31,6 +29,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
31
29
}
32
30
#endif
33
31
auto init_thread = [&](size_t start, size_t end) {
32
+ std::random_device rd;
34
33
std::default_random_engine generator (rd ());
35
34
std::uniform_real_distribution<float > distribution (min, max);
36
35
@@ -341,13 +340,6 @@ struct test_case {
341
340
}
342
341
}
343
342
344
- // if (t1->op == GGML_OP_SOFT_MAX) {
345
- // printf("[%s] ", ggml_op_desc(t1));
346
- // for (int i = 0; i < f1.size(); i++) {
347
- // printf("(%x, %x) ", *(uint32_t*)&f1[i], *(uint32_t*)&f2[i]);
348
- // }
349
- // printf("\n");
350
- // }
351
343
double err = nmse (f1.data (), f2.data (), f1.size ());
352
344
if (err > ud->max_err ) {
353
345
printf (" [%s] NMSE = %f " , ggml_op_desc (t1), err);
@@ -447,8 +439,9 @@ struct test_case {
447
439
return size;
448
440
};
449
441
for (int i = 0 ; i < gf->n_nodes ; i++) {
450
- if (ggml_is_view_op (gf->nodes [i]->op ) || gf->nodes [i] == out)
442
+ if (ggml_is_view_op (gf->nodes [i]->op ) || gf->nodes [i] == out) {
451
443
continue ;
444
+ }
452
445
mem += tensor_op_size (gf->nodes [i]);
453
446
}
454
447
@@ -1137,23 +1130,26 @@ struct test_sum_rows : public test_case {
1137
1130
}
1138
1131
};
1139
1132
1133
+ // Mixtral MOE
1140
1134
struct test_moe : public test_case {
1141
- const int n_experts = 8 ;
1142
- const int n_experts_per_tok = 2 ;
1143
- const int n_tokens = 1 ;
1144
- const int n_embd = 4096 ;
1145
- const int n_ff = 14336 ;
1135
+ const int n_experts;
1136
+ const int n_experts_per_tok;
1137
+ const int n_tokens;
1138
+ const int n_embd;
1139
+ const int n_ff;
1146
1140
1147
1141
std::string op_desc (ggml_tensor * t) override {
1148
1142
return " MOE" ;
1143
+
1149
1144
GGML_UNUSED (t);
1150
1145
}
1151
1146
1152
1147
std::string vars () override {
1153
1148
return VARS_TO_STR5 (n_experts, n_experts_per_tok, n_tokens, n_embd, n_ff);
1154
1149
}
1155
1150
1156
- test_moe () {
1151
+ test_moe (int n_experts = 8 , int n_experts_per_tok = 2 , int n_tokens = 1 , int n_embd = 4096 , int n_ff = 14336 )
1152
+ : n_experts(n_experts), n_experts_per_tok(n_experts_per_tok), n_tokens(n_tokens), n_embd(n_embd), n_ff(n_ff) {
1157
1153
}
1158
1154
1159
1155
ggml_tensor * build_graph (ggml_context * ctx) override {
@@ -1171,24 +1167,20 @@ struct test_moe : public test_case {
1171
1167
1172
1168
ggml_tensor * cur = ggml_new_tensor_2d (ctx, GGML_TYPE_F32, n_embd, n_tokens);
1173
1169
1174
- ggml_tensor * logits = ggml_mul_mat (ctx, ffn_gate_inp, cur); // [n_tokens, num_experts]
1175
- ggml_tensor * probs = ggml_soft_max_ext (ctx, logits, nullptr , 1 .0f /sqrtf (n_embd)); // [n_tokens, num_experts]
1170
+ ggml_tensor * logits = ggml_mul_mat (ctx, ffn_gate_inp, cur);
1171
+ ggml_tensor * probs = ggml_soft_max_ext (ctx, logits, nullptr , 1 .0f /sqrtf (n_embd));
1176
1172
1177
1173
// select experts
1178
- ggml_tensor * selected_experts = ggml_top_k (ctx, probs, n_experts_per_tok); // [n_tokens, num_experts_per_tok]
1174
+ ggml_tensor * selected_experts = ggml_top_k (ctx, probs, n_experts_per_tok);
1179
1175
1180
1176
ggml_tensor * weights = ggml_get_rows (ctx,
1181
1177
ggml_reshape_3d (ctx, probs, 1 , n_experts, n_tokens), selected_experts);
1182
- printf (" get rows args %ld %ld %ld %ld, %ld %ld %ld %ld\n " ,
1183
- weights->src [0 ]->ne [0 ], weights->src [0 ]->ne [1 ], weights->src [0 ]->ne [2 ], weights->src [0 ]->ne [3 ],
1184
- weights->src [1 ]->ne [0 ], weights->src [1 ]->ne [1 ], weights->src [1 ]->ne [2 ], weights->src [1 ]->ne [3 ]);
1185
1178
1186
-
1187
- weights = ggml_reshape_2d (ctx, weights, n_experts_per_tok, n_tokens); // [n_tokens, num_experts_per_tok]
1179
+ weights = ggml_reshape_2d (ctx, weights, n_experts_per_tok, n_tokens);
1188
1180
1189
1181
ggml_tensor * weights_sum = ggml_sum_rows (ctx, weights);
1190
1182
1191
- weights = ggml_div (ctx, weights, weights_sum); // [n_tokens, num_experts_per_tok]
1183
+ weights = ggml_div (ctx, weights, weights_sum);
1192
1184
1193
1185
// compute expert outputs
1194
1186
ggml_tensor * moe_out = nullptr ;
@@ -1202,9 +1194,9 @@ struct test_moe : public test_case {
1202
1194
1203
1195
cur_gate = ggml_silu (ctx, cur_gate);
1204
1196
1205
- cur_expert = ggml_mul (ctx, cur_up, cur_gate); // [n_tokens, n_embd]
1197
+ cur_expert = ggml_mul (ctx, cur_up, cur_gate);
1206
1198
1207
- cur_expert = ggml_mul_mat_id (ctx, ffn_down_exp.data (), n_experts, selected_experts, i, cur_expert); // [n_tokens, n_embd]
1199
+ cur_expert = ggml_mul_mat_id (ctx, ffn_down_exp.data (), n_experts, selected_experts, i, cur_expert);
1208
1200
1209
1201
cur_expert = ggml_mul (ctx, cur_expert,
1210
1202
ggml_view_2d (ctx, weights, 1 , n_tokens, weights->nb [1 ], i*weights->nb [0 ]));
@@ -1240,8 +1232,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
1240
1232
GGML_TYPE_Q6_K
1241
1233
};
1242
1234
1243
- test_cases.emplace_back (new test_moe ());
1244
-
1245
1235
// unary ops
1246
1236
for (int op = 0 ; op < GGML_UNARY_OP_COUNT; op++) {
1247
1237
test_cases.emplace_back (new test_unary ((ggml_unary_op) op));
@@ -1374,6 +1364,9 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
1374
1364
1375
1365
test_cases.emplace_back (new test_sum_rows ());
1376
1366
1367
+ test_cases.emplace_back (new test_moe (8 , 2 , 1 , 4096 , 14336 ));
1368
+ test_cases.emplace_back (new test_moe (8 , 2 , 8 , 4096 , 14336 ));
1369
+
1377
1370
// run tests
1378
1371
if (mode == MODE_TEST) {
1379
1372
ggml_backend_t backend_cpu = ggml_backend_cpu_init ();
@@ -1389,14 +1382,17 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
1389
1382
ggml_backend_free (backend_cpu);
1390
1383
1391
1384
return n_ok == test_cases.size ();
1392
- } else if (mode == MODE_PERF) {
1385
+ }
1386
+
1387
+ if (mode == MODE_PERF) {
1393
1388
for (auto & test : test_cases) {
1394
1389
test->eval_perf (backend, op_name);
1395
1390
}
1396
1391
return true ;
1397
- } else {
1398
- GGML_ASSERT (false );
1399
1392
}
1393
+
1394
+ GGML_ASSERT (false );
1395
+ return false ;
1400
1396
}
1401
1397
1402
1398
static void usage (char ** argv) {
@@ -1469,11 +1465,12 @@ int main(int argc, char ** argv) {
1469
1465
}
1470
1466
1471
1467
printf (" %zu/%zu backends passed\n " , n_ok, ggml_backend_reg_get_count ());
1468
+
1472
1469
if (n_ok != ggml_backend_reg_get_count ()) {
1473
1470
printf (" \033 [1;31mFAIL\033 [0m\n " );
1474
1471
return 1 ;
1475
- } else {
1476
- printf (" \033 [1;32mOK\033 [0m\n " );
1477
- return 0 ;
1478
1472
}
1473
+
1474
+ printf (" \033 [1;32mOK\033 [0m\n " );
1475
+ return 0 ;
1479
1476
}
0 commit comments