@@ -3385,3 +3385,147 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         }
     }
 }
+
+// FIXME: this code is duplicated from ggml-aarch64.c
+static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+    block_q4_0x4 out;
+
+    for (int i = 0; i < 4; i++) {
+        out.d[i] = in[i].d;
+    }
+
+    for (int i = 0; i < QK4_0 * 2; i++) {
+        int src_offset = (i / (4 * blck_size_interleave)) * blck_size_interleave;
+        int src_id     = (i % (4 * blck_size_interleave)) / blck_size_interleave;
+        src_offset += (i % blck_size_interleave);
+
+        out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+    }
+
+    return out;
+}
+
+// interleave 8 block_q4_0s in blocks of blck_size_interleave
+// returns an interleaved block_q4_0x8
+// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
+// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
+static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+    block_q4_0x8 out;
+
+    for (int i = 0; i < 8; i++) {
+        out.d[i] = in[i].d;
+    }
+
+    for (int i = 0; i < QK4_0 * 4; i++) {
+        int src_offset = (i / (8 * blck_size_interleave)) * blck_size_interleave;
+        int src_id     = (i % (8 * blck_size_interleave)) / blck_size_interleave;
+        src_offset += (i % blck_size_interleave);
+
+        out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+    }
+
+    return out;
+}
+
+static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) {
+    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
+    GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
+
+    block_q4_0x4 * dst = (block_q4_0x4 *)t->data;
+    const block_q4_0 * src = (const block_q4_0 *)data;
+    block_q4_0 dst_tmp[4];
+    int nrow = t->ne[1]; // Number of rows
+    int nrows_interleaved = 4;
+    int nblocks = t->ne[0] / QK4_0;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
+
+    if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+        return -1;
+    }
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            *dst++ = make_block_q4_0x4(dst_tmp, interleave_block, 0x88);
+        }
+        src += nrows_interleaved * nblocks;
+    }
+    return 0;
+
+    GGML_UNUSED(data_size);
+}
+
+static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) {
+    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
+    GGML_ASSERT(interleave_block == 8);
+
+    block_q4_0x8 * dst = (block_q4_0x8 *)t->data;
+    const block_q4_0 * src = (const block_q4_0 *) data;
+    block_q4_0 dst_tmp[8];
+    int nrow = t->ne[1]; // Number of rows
+    int nrows_interleaved = 8;
+    int nblocks = t->ne[0] / QK4_0;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
+
+    if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+        return -1;
+    }
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            *dst++ = make_block_q4_0x8(dst_tmp, interleave_block, 0x88);
+        }
+        src += nrows_interleaved * nblocks;
+    }
+    return 0;
+
+    GGML_UNUSED(data_size);
+}
+
+// Prepare for optimized kernels if applicable
+void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * restrict data, size_t data_size) {
+    if (cur->type == repack_type) {
+        memcpy(cur->data, data, data_size);
+        return;
+    }
+
+    GGML_ASSERT(cur->type == GGML_TYPE_Q4_0);
+
+    switch (repack_type) {
+        case GGML_TYPE_Q4_0_8_8:
+            repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
+            break;
+        case GGML_TYPE_Q4_0_4_8:
+            repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
+            break;
+        case GGML_TYPE_Q4_0_4_4:
+            repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
+            break;
+        default:
+            GGML_ABORT("Unsupported type");
+    }
+}
+
+enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) {
+    if (cur->type == GGML_TYPE_Q4_0) {
+        // TODO: enable for AVX2 - currently disabled due to bad gemv performance
+        if (/* ggml_cpu_has_avx2() || */ (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
+            return GGML_TYPE_Q4_0_8_8;
+        }
+        if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+            return GGML_TYPE_Q4_0_4_8;
+        }
+        if (ggml_cpu_has_neon()) {
+            return GGML_TYPE_Q4_0_4_4;
+        }
+    }
+
+    return cur->type;
+}
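A small standalone sketch (not part of the diff above) may help illustrate the src_id/src_offset index math that make_block_q4_0x4 and make_block_q4_0x8 use to interleave quants from several rows: it repeats the same loop on plain byte arrays so the resulting layout can be printed and inspected. QK4_0 = 32 and the 4-row / 4-byte interleave are taken from the diff; the array and variable names below are hypothetical, and the 0x88 XOR of the packed nibbles is omitted because it only flips the nibble encoding and does not affect which source byte lands where.

#include <stdio.h>

#define QK4_0 32   // quants per block_q4_0 (stored as QK4_0 / 2 packed bytes)
#define NROWS 4    // number of rows interleaved by make_block_q4_0x4

int main(void) {
    unsigned int blck_size_interleave = 4;   // as used for GGML_TYPE_Q4_0_4_4
    unsigned char in[NROWS][QK4_0 / 2];      // stand-in for in[i].qs (hypothetical)
    unsigned char out[NROWS * QK4_0 / 2];    // stand-in for out.qs (hypothetical)

    // Fill each source row with recognizable values: high nibble = row, low nibble = offset.
    for (int r = 0; r < NROWS; r++) {
        for (int j = 0; j < QK4_0 / 2; j++) {
            in[r][j] = (unsigned char)((r << 4) | (j & 0x0F));
        }
    }

    // Same index math as the loop over QK4_0 * 2 bytes in make_block_q4_0x4,
    // minus the XOR with 0x88.
    for (int i = 0; i < QK4_0 * 2; i++) {
        int src_offset = (i / (NROWS * blck_size_interleave)) * blck_size_interleave;
        int src_id     = (i % (NROWS * blck_size_interleave)) / blck_size_interleave;
        src_offset    += (i % blck_size_interleave);

        out[i] = in[src_id][src_offset];
    }

    // Print the first 16 output bytes: 4 bytes from row 0, then 4 from row 1, and so on,
    // i.e. the quants of the 4 rows are woven together in chunks of blck_size_interleave.
    for (int i = 0; i < 16; i++) {
        printf("out[%2d] = row %d, offset %d\n", i, out[i] >> 4, out[i] & 0x0F);
    }
    return 0;
}

The same pattern with NROWS = 8 and a loop bound of QK4_0 * 4 reproduces the make_block_q4_0x8 layout used for GGML_TYPE_Q4_0_4_8 and GGML_TYPE_Q4_0_8_8.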