
Commit 2741a99

Apply ggerganov's fixes for test-backend-ops
1 parent bc278c8 commit 2741a99

4 files changed (+10 -5 lines)

ggml-metal.m (+1 -1)

@@ -803,7 +803,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_GET_ROWS:
             {
-                return op->ne[3] == 1;
+                return op->src[0]->type != GGML_TYPE_BF16 && op->ne[3] == 1;
             }
         default:
             return false;
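With no BF16 variants of these Metal kernels, ggml_metal_supports_op must report the op as unsupported so that callers (including test-backend-ops) can skip it or reroute it to another backend. A minimal sketch of that caller-side gating, assuming the public ggml_backend_supports_op() wrapper that forwards to per-backend checks such as this one (the tensor names are hypothetical):

    // hypothetical caller-side gate; ctx, src and ids stand in for
    // whatever the surrounding code already has
    struct ggml_tensor * rows = ggml_get_rows(ctx, src, ids);
    if (!ggml_backend_supports_op(backend, rows)) {
        // a BF16 src[0] on Metal lands here after this commit:
        // fall back to another backend or skip the test case
    }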

ggml.c (+4 -1)

@@ -19765,7 +19765,10 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
             case GGML_OP_CPY:
             case GGML_OP_DUP:
                 {
-                    if (ggml_is_quantized(node->type)) {
+                    if (ggml_is_quantized(node->type) ||
+                        // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
+                        (node->src[0]->type == GGML_TYPE_F16  && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) ||
+                        (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                     }
                 } break;
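The new condition reflects that ggml has no direct F16 <-> BF16 copy path: each row is widened to F32 first, so the plan must reserve ne[0] floats of scratch per task, just as it already did for quantized destinations. An illustrative sketch of that staging (not the actual ggml kernel), assuming ggml.h's ggml_fp16_to_fp32() and ggml_fp32_to_bf16() helpers:

    // copy one row, staging through a per-thread F32 scratch buffer
    // (wdata points at the ne0 floats reserved by ggml_graph_plan)
    static void copy_row_f16_to_bf16(const ggml_fp16_t * src, ggml_bf16_t * dst,
                                     float * wdata, int64_t ne0) {
        for (int64_t i = 0; i < ne0; ++i) {
            wdata[i] = ggml_fp16_to_fp32(src[i]); // F16 -> F32
        }
        for (int64_t i = 0; i < ne0; ++i) {
            dst[i] = ggml_fp32_to_bf16(wdata[i]); // F32 -> BF16
        }
    }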

gguf-py/gguf/constants.py (+1 -1)

@@ -863,7 +863,6 @@ def get_type(val: Any) -> GGUFValueType:
 GGML_QUANT_SIZES = {
     GGMLQuantizationType.F32: (1, 4),
     GGMLQuantizationType.F16: (1, 2),
-    GGMLQuantizationType.BF16: (1, 2),
     GGMLQuantizationType.Q4_0: (32, 2 + 16),
     GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16),
     GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16),
@@ -890,6 +889,7 @@ def get_type(val: Any) -> GGUFValueType:
     GGMLQuantizationType.I64: (1, 8),
     GGMLQuantizationType.F64: (1, 8),
     GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
+    GGMLQuantizationType.BF16: (1, 2),
 }
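Each entry maps a type to (block_size, type_size in bytes); the BF16 pair is unchanged and appears to move only so the table stays in enum order, BF16 having been appended to GGMLQuantizationType after IQ1_M. The pairs feed row-size arithmetic of the form bytes = ne / block_size * type_size; a minimal C sketch mirroring what ggml_row_size() computes:

    #include <stddef.h>

    // bytes needed to store ne elements of a type with the given block geometry
    size_t row_size(size_t ne, size_t block_size, size_t type_size) {
        return ne / block_size * type_size;
    }

    // BF16: row_size(4096, 1, 2)   -> 8192 bytes (same as F16)
    // Q4_0: row_size(4096, 32, 18) -> 2304 bytes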
tests/test-backend-ops.cpp (+4 -2)

@@ -50,7 +50,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
 
     if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
         ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
-    } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16) {
+    } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
         GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
         std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
         std::vector<float> imatrix(tensor->ne[0], 1.0f); // dummy importance matrix
@@ -92,6 +92,8 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
                     size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0/bs*t->nb[0];
                     if (t->type == GGML_TYPE_F16) {
                         tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i]));
+                    } else if (t->type == GGML_TYPE_BF16) {
+                        tv.push_back(ggml_bf16_to_fp32(*(ggml_bf16_t*)&buf[i]));
                     } else if (t->type == GGML_TYPE_F32) {
                         tv.push_back(*(float *) &buf[i]);
                     } else if (t->type == GGML_TYPE_I32) {
@@ -1898,7 +1900,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     std::default_random_engine rng(0);
 
     const ggml_type all_types[] = {
-        GGML_TYPE_F32, GGML_TYPE_F16,
+        GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
         GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
         GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
         GGML_TYPE_Q8_0,
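tensor_to_float() can rely on ggml_bf16_to_fp32() because bfloat16 is the high 16 bits of an IEEE-754 binary32: same sign bit and 8-bit exponent, mantissa truncated to 7 bits. Widening to F32 is therefore a 16-bit shift; a self-contained C sketch of the same computation:

    #include <stdint.h>
    #include <string.h>

    // widen a raw bfloat16 bit pattern to float, as ggml_bf16_to_fp32() does
    static float bf16_bits_to_fp32(uint16_t h) {
        uint32_t bits = (uint32_t) h << 16; // truncated mantissa bits come back as zero
        float f;
        memcpy(&f, &bits, sizeof f);        // type-pun safely via memcpy
        return f;
    }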
