Skip to content

Commit 3c48621

Browse files
eddnjjnarthw
authored andcommitted
ggml : add run-time detection of neon, i8mm and sve (ggml-org#9331)
* ggml: Added run-time detection of neon, i8mm and sve Adds run-time detection of the Arm instructions set features neon, i8mm and sve for Linux and Apple build targets. * ggml: Extend feature detection to include non aarch64 Arm arch * ggml: Move definition of ggml_arm_arch_features to the global data section
1 parent 514091f commit 3c48621

File tree

5 files changed

+93
-32
lines changed

5 files changed

+93
-32
lines changed

ggml/include/ggml.h

+3
Original file line numberDiff line numberDiff line change
@@ -2507,6 +2507,9 @@ extern "C" {
25072507
GGML_API int ggml_cpu_has_cann (void);
25082508
GGML_API int ggml_cpu_has_llamafile (void);
25092509

2510+
// get the sve vector length in bytes
2511+
GGML_API int ggml_cpu_get_sve_cnt(void);
2512+
25102513
//
25112514
// Internal types and functions exposed for tests and benchmarks
25122515
//

ggml/src/ggml-aarch64.c

+2-11
Original file line numberDiff line numberDiff line change
@@ -598,15 +598,6 @@ size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_
598598
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
599599
}
600600

601-
// Return the number of byte lanes in the SVE vector if SVE is supported; otherwise, returns 0 if SVE is not supported.
602-
static int sve_lane_count(void) {
603-
#if defined(__ARM_FEATURE_SVE)
604-
return ggml_sve_cnt_b;
605-
#else
606-
return 0;
607-
#endif
608-
}
609-
610601
void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
611602
const int qk = QK8_0;
612603
const int nb = n / qk;
@@ -843,7 +834,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
843834

844835
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
845836
#if defined(__ARM_FEATURE_SVE)
846-
if (ggml_cpu_has_sve() && sve_lane_count() == QK8_0) {
837+
if (ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == QK8_0) {
847838
const void * b_ptr = vx;
848839
const void * a_ptr = vy;
849840
float * res_ptr = s;
@@ -2020,7 +2011,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
20202011

20212012
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
20222013
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
2023-
if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && sve_lane_count() == QK8_0) {
2014+
if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
20242015
const void * b_ptr = vx;
20252016
const void * a_ptr = vy;
20262017
float * res_ptr = s;

ggml/src/ggml-quants.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -4013,7 +4013,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
40134013
svfloat32_t sumv0 = svdup_n_f32(0.0f);
40144014
svfloat32_t sumv1 = svdup_n_f32(0.0f);
40154015

4016-
const int vector_length = ggml_sve_cnt_b*8;
4016+
const int vector_length = ggml_cpu_get_sve_cnt()*8;
40174017

40184018
// VLA Implementation using switch case
40194019
switch (vector_length) {
@@ -5597,7 +5597,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
55975597
svfloat32_t sumv0 = svdup_n_f32(0.0f);
55985598
svfloat32_t sumv1 = svdup_n_f32(0.0f);
55995599

5600-
const int vector_length = ggml_sve_cnt_b*8;
5600+
const int vector_length = ggml_cpu_get_sve_cnt()*8;
56015601

56025602
//VLA Implemenation for SVE
56035603
switch (vector_length) {

ggml/src/ggml-quants.h

-4
Original file line numberDiff line numberDiff line change
@@ -142,10 +142,6 @@ void iq2xs_free_impl(enum ggml_type type);
142142
void iq3xs_init_impl(int grid_size);
143143
void iq3xs_free_impl(int grid_size);
144144

145-
#if defined(__ARM_FEATURE_SVE)
146-
extern int ggml_sve_cnt_b;
147-
#endif
148-
149145
#ifdef __cplusplus
150146
}
151147
#endif

ggml/src/ggml.c

+86-15
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,6 @@
3939
#include <unistd.h>
4040
#endif
4141

42-
#if defined(__ARM_FEATURE_SVE)
43-
int ggml_sve_cnt_b = 0;
44-
#endif
4542
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
4643
#undef GGML_USE_LLAMAFILE
4744
#endif
@@ -455,6 +452,15 @@ static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
455452
// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
456453
float ggml_table_f32_f16[1 << 16];
457454

455+
#if defined(__ARM_ARCH)
456+
struct ggml_arm_arch_features_type {
457+
int has_neon;
458+
int has_i8mm;
459+
int has_sve;
460+
int sve_cnt;
461+
} ggml_arm_arch_features = {-1, -1, -1, 0};
462+
#endif
463+
458464
GGML_CALL const char * ggml_status_to_string(enum ggml_status status) {
459465
switch (status) {
460466
case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
@@ -3673,6 +3679,66 @@ static inline int ggml_up(int n, int m) {
36733679

36743680
////////////////////////////////////////////////////////////////////////////////
36753681

3682+
#if defined(__ARM_ARCH)
3683+
3684+
#if defined(__linux__) && defined(__aarch64__)
3685+
#include <sys/auxv.h>
3686+
#elif defined(__APPLE__)
3687+
#include <sys/sysctl.h>
3688+
#endif
3689+
3690+
static void ggml_init_arm_arch_features(void) {
3691+
#if defined(__linux__) && defined(__aarch64__)
3692+
uint32_t hwcap = getauxval(AT_HWCAP);
3693+
uint32_t hwcap2 = getauxval(AT_HWCAP2);
3694+
3695+
ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
3696+
ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
3697+
ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
3698+
3699+
#if defined(__ARM_FEATURE_SVE)
3700+
ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
3701+
#endif
3702+
#elif defined(__APPLE__)
3703+
int oldp = 0;
3704+
size_t size = sizeof(oldp);
3705+
if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) {
3706+
oldp = 0;
3707+
}
3708+
ggml_arm_arch_features.has_neon = oldp;
3709+
3710+
if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
3711+
oldp = 0;
3712+
}
3713+
ggml_arm_arch_features.has_i8mm = oldp;
3714+
3715+
ggml_arm_arch_features.has_sve = 0;
3716+
ggml_arm_arch_features.sve_cnt = 0;
3717+
#else
3718+
// Run-time CPU feature detection not implemented for this platform, fallback to compile time
3719+
#if defined(__ARM_NEON)
3720+
ggml_arm_arch_features.has_neon = 1;
3721+
#else
3722+
ggml_arm_arch_features.has_neon = 0;
3723+
#endif
3724+
3725+
#if defined(__ARM_FEATURE_MATMUL_INT8)
3726+
ggml_arm_arch_features.has_i8mm = 1;
3727+
#else
3728+
ggml_arm_arch_features.has_i8mm = 0;
3729+
#endif
3730+
3731+
#if defined(__ARM_FEATURE_SVE)
3732+
ggml_arm_arch_features.has_sve = 1;
3733+
ggml_arm_arch_features.sve_cnt = 16;
3734+
#else
3735+
ggml_arm_arch_features.has_sve = 0;
3736+
ggml_arm_arch_features.sve_cnt = 0;
3737+
#endif
3738+
#endif
3739+
}
3740+
#endif
3741+
36763742
struct ggml_context * ggml_init(struct ggml_init_params params) {
36773743
// make this function thread safe
36783744
ggml_critical_section_start();
@@ -3723,6 +3789,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
37233789
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
37243790
}
37253791

3792+
#if defined(__ARM_ARCH)
3793+
ggml_init_arm_arch_features();
3794+
#endif
3795+
37263796
is_first_call = false;
37273797
}
37283798

@@ -3771,12 +3841,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
37713841

37723842
GGML_ASSERT_ALIGNED(ctx->mem_buffer);
37733843

3774-
#if defined(__ARM_FEATURE_SVE)
3775-
if (!ggml_sve_cnt_b) {
3776-
ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
3777-
}
3778-
#endif
3779-
37803844
GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
37813845

37823846
ggml_critical_section_end();
@@ -23578,16 +23642,16 @@ int ggml_cpu_has_fma(void) {
2357823642
}
2357923643

2358023644
int ggml_cpu_has_neon(void) {
23581-
#if defined(__ARM_NEON)
23582-
return 1;
23645+
#if defined(__ARM_ARCH)
23646+
return ggml_arm_arch_features.has_neon;
2358323647
#else
2358423648
return 0;
2358523649
#endif
2358623650
}
2358723651

2358823652
int ggml_cpu_has_sve(void) {
23589-
#if defined(__ARM_FEATURE_SVE)
23590-
return 1;
23653+
#if defined(__ARM_ARCH)
23654+
return ggml_arm_arch_features.has_sve;
2359123655
#else
2359223656
return 0;
2359323657
#endif
@@ -23734,11 +23798,18 @@ int ggml_cpu_has_vsx(void) {
2373423798
}
2373523799

2373623800
int ggml_cpu_has_matmul_int8(void) {
23737-
#if defined(__ARM_FEATURE_MATMUL_INT8)
23738-
return 1;
23801+
#if defined(__ARM_ARCH)
23802+
return ggml_arm_arch_features.has_i8mm;
2373923803
#else
2374023804
return 0;
2374123805
#endif
2374223806
}
2374323807

23808+
int ggml_cpu_get_sve_cnt(void) {
23809+
#if defined(__ARM_ARCH)
23810+
return ggml_arm_arch_features.sve_cnt;
23811+
#else
23812+
return 0;
23813+
#endif
23814+
}
2374423815
////////////////////////////////////////////////////////////////////////////////

0 commit comments

Comments
 (0)