🙏 Request for Help
I am stuck. The performance is consistently below the upstream version, despite my best efforts to use SVE efficiently. I would be very grateful if anyone can:
- Suggest potential optimization strategies for 128-bit SVE
- Point out common mistakes when porting from NEON to SVE
- Share similar experience with optimizing kernels on Dimensity SoCs
Any feedback or direction would be deeply appreciated. Thank you!
I also have a question: for example, using `svmul_f32` with SVE does not seem to be any faster than a regular scalar multiplication in my code.
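For context, here is a simplified, self-contained sketch of the kind of loop I mean (illustrative only; the function names are made up for this example and it is not my actual kernel):

```c
// Simplified sketch, not the real kernel.
// Compile with e.g.: clang -O3 -march=armv9-a+sve mul.c
#include <arm_sve.h>
#include <stddef.h>

// Scalar reference: y[i] = a[i] * b[i]
static void mul_scalar(float *y, const float *a, const float *b, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        y[i] = a[i] * b[i];
    }
}

// SVE version using svmul_f32_x. On a 128-bit SVE implementation
// svcntw() == 4, so each iteration handles 4 floats, the same as a
// NEON vmulq_f32 loop, and the loop can easily be memory-bound
// rather than compute-bound.
static void mul_sve(float *y, const float *a, const float *b, size_t n) {
    for (size_t i = 0; i < n; i += svcntw()) {
        svbool_t    pg = svwhilelt_b32((uint64_t)i, (uint64_t)n);
        svfloat32_t va = svld1_f32(pg, a + i);
        svfloat32_t vb = svld1_f32(pg, b + i);
        svst1_f32(pg, y + i, svmul_f32_x(pg, va, vb));
    }
}
```

My understanding is that with a 128-bit vector length this processes the same four floats per iteration as NEON, so I would not expect `svmul_f32` by itself to beat scalar code that the compiler already auto-vectorizes, but I would like to confirm whether that is the right way to think about it.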
🧩 System Information
- Device: iQOO Neo10
- SoC: MediaTek Dimensity 9400
- Memory: 16 GB RAM, 512 GB storage
- ISA: Armv9 with SVE 128-bit support
- Model tested: Qwen2.5-1.5B-Instruct-Q4_0.gguf
- FA mode: 1
- Threads: 4, 8
- llama.cpp build: 7a84777 (5054)
This is my code:
and this is my llama-bench:
and this is the upstream llama-bench: