Fix merge

SlyEcho · ardfork · KerfuffleV2 · YellowRoseCx · commit 9dba0c985f14 · 2023-08-10T13:05:51.000-05:00
---------

Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
@@ -1641,8 +1641,8 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
 #else
     const float2 dm8f = __half22float2(dm8);
     const float2 ds8f = __half22float2(ds8);
-    const float d8d8 = dm8f.x * ds8f.x;
-    const float m8s8 = dm8f.y * ds8f.y;
+    const float d8d8 = __low2float(dm8) * __low2float(ds8);
+    const float m8s8 = __high2float(dm8) * __high2float(ds8);
 #endif // GGML_CUDA_F16
 
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
@@ -3281,7 +3281,7 @@ static __global__ void mul_mat_q(
                     *dsi_dst = *dsi_src;
                 } else {
                     float * dfi_dst = (float *) dsi_dst;
-                    *dfi_dst = (*dsi_src).x;
+                    *dfi_dst = __low2half(*dsi_src);
                 }
             }