@@ -15,7 +15,7 @@
 #include <iterator>
 #include <algorithm>
 
-float tensor_sum_elements(struct ggml_tensor * tensor) {
+float tensor_sum_elements(const ggml_tensor * tensor) {
     float sum = 0;
     if (tensor->type==GGML_TYPE_F32) {
         for (int j = 0; j < tensor->ne[1]; j++) {
@@ -27,21 +27,15 @@ float tensor_sum_elements(struct ggml_tensor * tensor) {
     return sum;
 }
 
+void tensor_dump(const ggml_tensor * tensor, const char * name) {
+    printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", name,
+        tensor->type, ggml_type_name(tensor->type),
+        (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
+    float sum = tensor_sum_elements(tensor);
+    printf("Sum of tensor %s is %6.2f\n", name, sum);
+}
 
-/*
-    These are mapping to unknown
-    GGML_TYPE_I8,
-    GGML_TYPE_I16,
-    GGML_TYPE_I32,
-    GGML_TYPE_COUNT,
-*/
-
-#define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN"
-
-#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \
-        TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\
-        (int) TENSOR->ne[0], (int) TENSOR->ne[1], (int) TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
-    { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }
+#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
 
 struct benchmark_params_struct {
     int32_t n_threads = 1;
@@ -59,8 +53,6 @@ void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct para
 }
 
 int main(int argc, char ** argv) {
-
-
     struct benchmark_params_struct benchmark_params;
 
     bool invalid_param = false;
@@ -84,11 +76,11 @@ int main(int argc, char ** argv) {
             print_usage(argc, argv, benchmark_params);
             exit(0);
         }
-        if (invalid_param) {
-            fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-            print_usage(argc, argv, benchmark_params);
-            exit(1);
-        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        print_usage(argc, argv, benchmark_params);
+        exit(1);
     }
 
     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
@@ -216,9 +208,8 @@ int main(int argc, char ** argv) {
     // Let's use the F32 result from above as a reference for the q4_0 multiplication
     float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
 
-
-    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n");
-    printf("==============================================================================================\n");
+    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
+    printf("=====================================================================================\n");
 
     for (int i=0;i<benchmark_params.n_iterations;i++) {
 
@@ -227,12 +218,12 @@ int main(int argc, char ** argv) {
         ggml_graph_compute(ctx, &gf31);
         long long int stop = ggml_time_us();
         long long int usec = stop-start;
-        float flops_per_usec = (1.0f*flops_per_matrix)/usec;
-        printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n",
+        double gflops = (double)(flops_per_matrix)/usec/1000.0;
+        printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
             i,
             gf31.n_threads,
             sizex, sizey, sizez, flops_per_matrix,
-            usec,flops_per_usec);
+            usec,gflops);
 
 #ifdef VERBOSE_DEBUGGING
         TENSOR_DUMP("res",gf31.nodes[0])
@@ -256,7 +247,5 @@ int main(int argc, char ** argv) {
 
         // Running a different graph computation to make sure we override the CPU cache lines
         ggml_graph_compute(ctx, &gf32);
-
     }
-
 }