Skip to content

Commit aab436c

Browse files
committed
ggml: Added run-time detection of neon, i8mm and sve
Adds run-time detection of the Arm instructions set features neon, i8mm and sve for Linux and Apple build targets.
1 parent d54c21d commit aab436c

File tree

5 files changed

+103
-34
lines changed

5 files changed

+103
-34
lines changed

ggml/include/ggml.h

+3
Original file line numberDiff line numberDiff line change
@@ -2479,6 +2479,9 @@ extern "C" {
24792479
GGML_API int ggml_cpu_has_cann (void);
24802480
GGML_API int ggml_cpu_has_llamafile (void);
24812481

2482+
// get the sve vector length in bytes
2483+
GGML_API int ggml_cpu_get_sve_cnt(void);
2484+
24822485
//
24832486
// Internal types and functions exposed for tests and benchmarks
24842487
//

ggml/src/ggml-aarch64.c

+14-14
Original file line numberDiff line numberDiff line change
@@ -546,8 +546,8 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
546546
UNUSED(blocklen);
547547

548548
#if defined(__ARM_FEATURE_SVE)
549-
if (ggml_sve_cnt_b == QK8_0) {
550-
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
549+
if (ggml_cpu_get_sve_cnt() == QK8_0) {
550+
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
551551
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
552552
}
553553
#endif
@@ -658,8 +658,8 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
658658
UNUSED(blocklen);
659659

660660
#if defined(__ARM_FEATURE_SVE)
661-
if (ggml_sve_cnt_b == QK8_0) {
662-
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
661+
if (ggml_cpu_get_sve_cnt() == QK8_0) {
662+
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
663663
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
664664
}
665665
#endif
@@ -776,7 +776,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
776776
UNUSED(blocklen);
777777

778778
#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
779-
if (ggml_sve_cnt_b == QK8_0) {
779+
if (ggml_cpu_get_sve_cnt() == QK8_0) {
780780
const void * b_ptr = vx;
781781
const void * a_ptr = vy;
782782
float * res_ptr = s;
@@ -842,12 +842,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
842842
return;
843843
}
844844
else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
845-
GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
845+
GGML_ASSERT((ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
846846
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
847847
"performance");
848848
}
849849
else if (ggml_cpu_has_neon()) {
850-
GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
850+
GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
851851
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
852852
"quantization format for optimal performance");
853853
}
@@ -997,8 +997,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
997997
UNUSED(blocklen);
998998

999999
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
1000-
if (ggml_sve_cnt_b == QK8_0) {
1001-
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
1000+
if (ggml_cpu_get_sve_cnt() == QK8_0) {
1001+
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
10021002
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
10031003
}
10041004
#endif
@@ -1518,8 +1518,8 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
15181518
UNUSED(blocklen);
15191519

15201520
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
1521-
if (ggml_sve_cnt_b == QK8_0) {
1522-
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
1521+
if (ggml_cpu_get_sve_cnt() == QK8_0) {
1522+
GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
15231523
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
15241524
}
15251525
#endif
@@ -1980,7 +1980,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
19801980
UNUSED(blocklen);
19811981

19821982
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
1983-
if (ggml_sve_cnt_b == QK8_0) {
1983+
if (ggml_cpu_get_sve_cnt() == QK8_0) {
19841984
const void * b_ptr = vx;
19851985
const void * a_ptr = vy;
19861986
float * res_ptr = s;
@@ -2391,12 +2391,12 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
23912391
return;
23922392
}
23932393
else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
2394-
GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
2394+
GGML_ASSERT((ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
23952395
"__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
23962396
"performance");
23972397
}
23982398
else if (ggml_cpu_has_neon()) {
2399-
GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
2399+
GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
24002400
"__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
24012401
"quantization format for optimal performance");
24022402
}

ggml/src/ggml-quants.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -4012,7 +4012,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
40124012
svfloat32_t sumv0 = svdup_n_f32(0.0f);
40134013
svfloat32_t sumv1 = svdup_n_f32(0.0f);
40144014

4015-
const int vector_length = ggml_sve_cnt_b*8;
4015+
const int vector_length = ggml_cpu_get_sve_cnt()*8;
40164016

40174017
// VLA Implementation using switch case
40184018
switch (vector_length) {
@@ -5596,7 +5596,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
55965596
svfloat32_t sumv0 = svdup_n_f32(0.0f);
55975597
svfloat32_t sumv1 = svdup_n_f32(0.0f);
55985598

5599-
const int vector_length = ggml_sve_cnt_b*8;
5599+
const int vector_length = ggml_cpu_get_sve_cnt()*8;
56005600

56015601
//VLA Implemenation for SVE
56025602
switch (vector_length) {

ggml/src/ggml-quants.h

-4
Original file line numberDiff line numberDiff line change
@@ -142,10 +142,6 @@ void iq2xs_free_impl(enum ggml_type type);
142142
void iq3xs_init_impl(int grid_size);
143143
void iq3xs_free_impl(int grid_size);
144144

145-
#if defined(__ARM_FEATURE_SVE)
146-
extern int ggml_sve_cnt_b;
147-
#endif
148-
149145
#ifdef __cplusplus
150146
}
151147
#endif

ggml/src/ggml.c

+84-14
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,15 @@
3737
#include <unistd.h>
3838
#endif
3939

40-
#if defined(__ARM_FEATURE_SVE)
41-
int ggml_sve_cnt_b = 0;
40+
#if defined(__aarch64__)
41+
struct ggml_aarch64_features_type {
42+
int has_neon;
43+
int has_i8mm;
44+
int has_sve;
45+
int sve_cnt;
46+
} ggml_aarch64_features = {-1, -1, -1, 0};
4247
#endif
48+
4349
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
4450
#undef GGML_USE_LLAMAFILE
4551
#endif
@@ -3643,6 +3649,65 @@ static inline int ggml_up(int n, int m) {
36433649

36443650
////////////////////////////////////////////////////////////////////////////////
36453651

3652+
#if defined(__aarch64__)
3653+
3654+
#if defined(__linux__)
3655+
#include <sys/auxv.h>
3656+
#elif defined(__APPLE__)
3657+
#include <sys/sysctl.h>
3658+
#endif
3659+
3660+
static void ggml_init_aarch64_features(void) {
3661+
#if defined(__linux__)
3662+
uint32_t hwcap = getauxval(AT_HWCAP);
3663+
uint32_t hwcap2 = getauxval(AT_HWCAP2);
3664+
3665+
ggml_aarch64_features.has_neon = !!(hwcap & HWCAP_ASIMD);
3666+
ggml_aarch64_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
3667+
ggml_aarch64_features.has_sve = !!(hwcap & HWCAP_SVE);
3668+
#if defined(__ARM_FEATURE_SVE)
3669+
ggml_aarch64_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
3670+
#endif
3671+
#elif defined(__APPLE__)
3672+
int oldp = 0;
3673+
size_t size = sizeof(oldp);
3674+
if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) {
3675+
oldp = 0;
3676+
}
3677+
ggml_aarch64_features.has_neon = oldp;
3678+
3679+
if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
3680+
oldp = 0;
3681+
}
3682+
ggml_aarch64_features.has_i8mm = oldp;
3683+
3684+
ggml_aarch64_features.has_sve = 0;
3685+
ggml_aarch64_features.sve_cnt = 0;
3686+
#else
3687+
// Run-time CPU feature detection not implemented for this platform, fallback to compile time
3688+
#if defined(__ARM_NEON)
3689+
ggml_aarch64_features.has_neon = 1;
3690+
#else
3691+
ggml_aarch64_features.has_neon = 0;
3692+
#endif
3693+
3694+
#if defined(__ARM_FEATURE_MATMUL_INT8)
3695+
ggml_aarch64_features.has_i8mm = 1;
3696+
#else
3697+
ggml_aarch64_features.has_i8mm = 0;
3698+
#endif
3699+
3700+
#if defined(__ARM_FEATURE_SVE)
3701+
ggml_aarch64_features.has_sve = 1;
3702+
ggml_aarch64_features.sve_cnt = 16;
3703+
#else
3704+
ggml_aarch64_features.has_sve = 0;
3705+
ggml_aarch64_features.sve_cnt = 0;
3706+
#endif
3707+
#endif
3708+
}
3709+
#endif
3710+
36463711
struct ggml_context * ggml_init(struct ggml_init_params params) {
36473712
// make this function thread safe
36483713
ggml_critical_section_start();
@@ -3693,6 +3758,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
36933758
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
36943759
}
36953760

3761+
#if defined(__aarch64__)
3762+
ggml_init_aarch64_features();
3763+
#endif
3764+
36963765
is_first_call = false;
36973766
}
36983767

@@ -3741,12 +3810,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
37413810

37423811
GGML_ASSERT_ALIGNED(ctx->mem_buffer);
37433812

3744-
#if defined(__ARM_FEATURE_SVE)
3745-
if (!ggml_sve_cnt_b) {
3746-
ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
3747-
}
3748-
#endif
3749-
37503813
GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
37513814

37523815
ggml_critical_section_end();
@@ -23265,16 +23328,16 @@ int ggml_cpu_has_fma(void) {
2326523328
}
2326623329

2326723330
int ggml_cpu_has_neon(void) {
23268-
#if defined(__ARM_NEON)
23269-
return 1;
23331+
#if defined(__aarch64__)
23332+
return ggml_aarch64_features.has_neon;
2327023333
#else
2327123334
return 0;
2327223335
#endif
2327323336
}
2327423337

2327523338
int ggml_cpu_has_sve(void) {
23276-
#if defined(__ARM_FEATURE_SVE)
23277-
return 1;
23339+
#if defined(__aarch64__)
23340+
return ggml_aarch64_features.has_sve;
2327823341
#else
2327923342
return 0;
2328023343
#endif
@@ -23421,11 +23484,18 @@ int ggml_cpu_has_vsx(void) {
2342123484
}
2342223485

2342323486
int ggml_cpu_has_matmul_int8(void) {
23424-
#if defined(__ARM_FEATURE_MATMUL_INT8)
23425-
return 1;
23487+
#if defined(__aarch64__)
23488+
return ggml_aarch64_features.has_i8mm;
2342623489
#else
2342723490
return 0;
2342823491
#endif
2342923492
}
2343023493

23494+
int ggml_cpu_get_sve_cnt(void) {
23495+
#if defined(__aarch64__)
23496+
return ggml_aarch64_features.sve_cnt;
23497+
#else
23498+
return 0;
23499+
#endif
23500+
}
2343123501
////////////////////////////////////////////////////////////////////////////////

0 commit comments

Comments
 (0)