Skip to content

Commit d23eb3b

Browse files
committed
Support for SME1 based sgemm_direct kernel for cblas_sgemm level 3 API
* Added ARMV9SME target * Added SGEMM_DIRECT kernel based on SME1
1 parent 18014b0 commit d23eb3b

26 files changed

+694
-36
lines changed

Diff for: CMakeLists.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,12 @@
44

55
cmake_minimum_required(VERSION 3.16.0)
66

7+
set (CMAKE_ASM_SOURCE_FILE_EXTENSIONS "S")
78
project(OpenBLAS C ASM)
89

910
set(OpenBLAS_MAJOR_VERSION 0)
1011
set(OpenBLAS_MINOR_VERSION 3)
11-
set(OpenBLAS_PATCH_VERSION 28.dev)
12+
set(OpenBLAS_PATCH_VERSION 29.dev)
1213

1314
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
1415

Diff for: Makefile.arm64

+5
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,11 @@ FCOMMON_OPT += -march=armv8-a+sve
3030
endif
3131
endif
3232

33+
ifeq ($(CORE), ARMV9SME)
34+
CCOMMON_OPT += -march=armv9-a+sve2+sme
35+
FCOMMON_OPT += -march=armv9-a+sve2
36+
endif
37+
3338
ifeq ($(CORE), CORTEXA53)
3439
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
3540
ifneq ($(F_COMPILER), NAG)

Diff for: Makefile.system

+8
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,7 @@ ifeq ($(ARCH), arm64)
420420
export MACOSX_DEPLOYMENT_TARGET=11.0
421421
ifeq ($(C_COMPILER), GCC)
422422
export NO_SVE = 1
423+
export NO_SME = 1
423424
endif
424425
else
425426
export MACOSX_DEPLOYMENT_TARGET=10.8
@@ -709,6 +710,9 @@ DYNAMIC_CORE += NEOVERSEN2
709710
DYNAMIC_CORE += ARMV8SVE
710711
DYNAMIC_CORE += A64FX
711712
endif
713+
ifneq ($(NO_SME), 1)
714+
DYNAMIC_CORE += ARMV9SME
715+
endif
712716
DYNAMIC_CORE += THUNDERX
713717
DYNAMIC_CORE += THUNDERX2T99
714718
DYNAMIC_CORE += TSV110
@@ -1474,6 +1478,10 @@ ifeq ($(NO_SVE), 1)
14741478
CCOMMON_OPT += -DNO_SVE
14751479
endif
14761480

1481+
ifeq ($(NO_SME), 1)
1482+
CCOMMON_OPT += -DNO_SME
1483+
endif
1484+
14771485
ifdef SMP
14781486
CCOMMON_OPT += -DSMP_SERVER
14791487

Diff for: TargetList.txt

+1
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ THUNDERX3T110
111111
VORTEX
112112
A64FX
113113
ARMV8SVE
114+
ARMV9SME
114115
FT2000
115116

116117
9.System Z:

Diff for: c_check

+19
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,24 @@ if [ "$architecture" = "arm64" ]; then
331331
rm -rf "$tmpd"
332332
fi
333333

334+
no_sme=0
335+
if [ "$architecture" = "arm64" ]; then
336+
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
337+
tmpf="$tmpd/a.S"
338+
printf ".text \n.global sme_test\n\nsme_test:\nsmstart\nsmstop\nret\n">> "$tmpf"
339+
args=" -march=armv9-a+sve2+sme -c -o $tmpf.o $tmpf"
340+
no_sme=0
341+
{
342+
$compiler_name $flags $args >/dev/null 2>&1
343+
} || {
344+
args=" -march=armv9-a+sme -c -o $tmpf.o $tmpf"
345+
$compiler_name $flags $args >/dev/null 2>&1
346+
} || {
347+
no_sme=1
348+
}
349+
rm -rf "$tmpd"
350+
fi
351+
334352
c11_atomics=0
335353
case "$data" in
336354
*HAVE_C11*)
@@ -472,6 +490,7 @@ done
472490
printf "CEXTRALIB=%s %s %s\n" "$linker_L" "$linker_l" "$linker_a"
473491
[ "$no_msa" -eq 1 ] && printf "NO_MSA=1\n"
474492
[ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n"
493+
[ "$no_sme" -eq 1 ] && printf "NO_SME=1\n"
475494
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
476495
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
477496
[ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"

Diff for: cmake/arch.cmake

+15-3
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,21 @@ endif ()
4444

4545
if (DYNAMIC_ARCH)
4646
if (ARM64)
47-
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
48-
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
49-
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX)
47+
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
48+
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
49+
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 10) # SVE ACLE supported in GCC >= 10
50+
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX)
51+
endif ()
52+
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 14) # SME ACLE supported in GCC >= 14
53+
set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME)
54+
endif()
55+
elseif (${CMAKE_C_COMPILER_ID} MATCHES "Clang")
56+
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 11) # SVE ACLE supported in LLVM >= 11
57+
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX)
58+
endif ()
59+
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19) # SME ACLE supported in LLVM >= 19
60+
set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME)
61+
endif()
5062
endif ()
5163
if (DYNAMIC_LIST)
5264
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})

Diff for: cmake/cc.cmake

+6
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,12 @@ if (${CORE} STREQUAL ARMV8SVE)
238238
endif ()
239239
endif ()
240240

241+
if (${CORE} STREQUAL ARMV9SME)
242+
if (NOT DYNAMIC_ARCH)
243+
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv9-a+sme")
244+
endif ()
245+
endif ()
246+
241247
if (${CORE} STREQUAL CORTEXA510)
242248
if (NOT DYNAMIC_ARCH)
243249
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")

Diff for: cmake/prebuild.cmake

+1-1
Original file line numberDiff line numberDiff line change
@@ -1014,7 +1014,7 @@ endif ()
10141014
set(ZGEMM_UNROLL_M 4)
10151015
set(ZGEMM_UNROLL_N 4)
10161016
set(SYMV_P 16)
1017-
elseif ("${TCORE}" STREQUAL "NEOVERSEN2")
1017+
elseif ("${TCORE}" STREQUAL "NEOVERSEN2" OR "${TCORE}" STREQUAL "ARMV9SME") # ARMV9SME shares the NEOVERSEN2 parameters; if() keywords are case-sensitive, so the operator must be upper-case OR
10181018
file(APPEND ${TARGET_CONF_TEMP}
10191019
"#define L1_CODE_SIZE\t65536\n"
10201020
"#define L1_CODE_LINESIZE\t64\n"

Diff for: cmake/system.cmake

+27-11
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,15 @@ endif()
2121
# Other files expect CORE, which is actually TARGET and will become TARGET_CORE for kernel build. Confused yet?
2222
# It seems we are meant to use TARGET as input and CORE internally as kernel.
2323
if(NOT DEFINED CORE AND DEFINED TARGET)
24-
set(CORE ${TARGET})
24+
if (${TARGET} STREQUAL "LOONGSON3R5")
25+
set(CORE "LA464")
26+
elseif (${TARGET} STREQUAL "LOONGSON2K1000")
27+
set(CORE "LA264")
28+
elseif (${TARGET} STREQUAL "LOONGSONGENERIC")
29+
set(CORE "LA64_GENERIC") # map TARGET=LOONGSONGENERIC to the generic LoongArch64 core name (no stray ')' inside the string)
30+
else ()
31+
set(CORE ${TARGET})
32+
endif()
2533
endif()
2634

2735
# TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1.
@@ -310,6 +318,9 @@ if (${TARGET} STREQUAL NEOVERSEV1)
310318
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve")
311319
endif()
312320
endif()
321+
if (${TARGET} STREQUAL ARMV9SME)
322+
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv9-a+sme -O3")
323+
endif()
313324
if (${TARGET} STREQUAL A64FX)
314325
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
315326
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve -mtune=a64fx")
@@ -382,6 +393,8 @@ if (NEED_PIC)
382393
if (NOT NOFORTRAN)
383394
if (${F_COMPILER} STREQUAL "SUN")
384395
set(FCOMMON_OPT "${FCOMMON_OPT} -pic")
396+
elseif (${F_COMPILER} STREQUAL "NAGFOR")
397+
set(FCOMMON_OPT "${FCOMMON_OPT} -PIC")
385398
else ()
386399
set(FCOMMON_OPT "${FCOMMON_OPT} -fPIC")
387400
endif ()
@@ -640,17 +653,17 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
640653
endif ()
641654

642655
if (CMAKE_Fortran_COMPILER)
643-
if ("${F_COMPILER}" STREQUAL "NAG" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
644-
set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512")
645-
if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
646-
message(STATUS "removing fortran flags")
647-
set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64")
656+
if ("${F_COMPILER}" STREQUAL "NAGFOR" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
657+
set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx2;-mavx;-mskylake-avx512") # ';'-separated list of flags to strip; -mavx2 precedes -mavx because string(REPLACE) matches substrings and would otherwise leave a stray "2"
658+
if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
659+
message(STATUS "removing fortran flags")
660+
set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64")
661+
endif ()
662+
foreach (FILTER_FLAG ${FILTER_FLAGS})
663+
string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS})
664+
string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS})
665+
endforeach ()
648666
endif ()
649-
foreach (FILTER_FLAG ${FILTER_FLAGS})
650-
string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS})
651-
string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS})
652-
endforeach ()
653-
endif ()
654667
endif ()
655668

656669
if ("${F_COMPILER}" STREQUAL "GFORTRAN")
@@ -670,6 +683,9 @@ endif ()
670683
if (${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
671684
set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_COMPLEX_STRUCTURE")
672685
endif ()
686+
if (${CMAKE_C_COMPILER_ID} MATCHES "IntelLLVM" AND ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
687+
set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DNOCHANGE")
688+
endif ()
673689

674690
if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
675691
if ("${F_COMPILER}" STREQUAL "FLANG")

Diff for: cmake/system_check.cmake

+11
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,17 @@ endif()
135135
endif()
136136
endif()
137137

138+
if (ARM64)
139+
if (NOT NO_SME)
140+
file(WRITE ${PROJECT_BINARY_DIR}/sme.c ".text \n.global sme_test\n\nsme_test:\nsmstart\nsmstop\nret\n")
141+
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=armv9-a+sve2+sme -c -v -o ${PROJECT_BINARY_DIR}/sme.o -x assembler ${PROJECT_BINARY_DIR}/sme.c OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_SME) # sme.c holds raw assembly, so force the assembler front end; NO_SME receives the compiler's exit status (0 = SME usable)
142+
if (NO_SME EQUAL 1)
143+
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_SME")
144+
endif()
145+
file(REMOVE "${PROJECT_BINARY_DIR}/sme.c" "${PROJECT_BINARY_DIR}/sme.o")
146+
endif()
147+
endif()
148+
138149
include(CheckIncludeFile)
139150
CHECK_INCLUDE_FILE("stdatomic.h" HAVE_C11)
140151
if (HAVE_C11 EQUAL 1)

Diff for: common.h

+1
Original file line numberDiff line numberDiff line change
@@ -696,6 +696,7 @@ void gotoblas_profile_init(void);
696696
void gotoblas_profile_quit(void);
697697

698698
int support_avx512(void);
699+
int support_sme1(void);
699700

700701
#ifdef USE_OPENMP
701702

Diff for: common_arm64.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
175175
#define HUGE_PAGESIZE ( 4 << 20)
176176

177177
#ifndef BUFFERSIZE
178-
#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE)
178+
#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) || defined(ARMV9SME)
179179
#define BUFFER_SIZE (32 << 22)
180180
#else
181181
#define BUFFER_SIZE (32 << 20)

Diff for: common_param.h

+6
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,12 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
221221
void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG);
222222
int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K);
223223
#endif
224+
#ifdef ARCH_ARM64
225+
#ifdef HAVE_SME
226+
void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG);
227+
#endif
228+
#endif
229+
224230

225231
int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG);
226232
int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);

Diff for: common_s.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -213,9 +213,9 @@
213213
#ifdef ARCH_X86_64
214214
#define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant
215215
#define SGEMM_DIRECT gotoblas -> sgemm_direct
216-
#else
216+
#elif defined(ARCH_ARM64) /* test for definition, matching the #ifdef ARCH_ARM64 guard used in common_param.h */
217217
#define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant
218-
#define SGEMM_DIRECT sgemm_direct
218+
#define SGEMM_DIRECT gotoblas -> sgemm_direct
219219
#endif
220220

221221
#define SGEMM_ONCOPY gotoblas -> sgemm_oncopy

Diff for: driver/others/dynamic_arm64.c

+34
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,11 @@ extern gotoblas_t gotoblas_ARMV8SVE;
115115
#else
116116
#define gotoblas_ARMV8SVE gotoblas_ARMV8
117117
#endif
118+
#ifdef DYN_ARMV9SME
119+
extern gotoblas_t gotoblas_ARMV9SME;
120+
#else
121+
#define gotoblas_ARMV9SME gotoblas_ARMV8
122+
#endif
118123
#ifdef DYN_CORTEX_A55
119124
extern gotoblas_t gotoblas_CORTEXA55;
120125
#else
@@ -148,6 +153,13 @@ extern gotoblas_t gotoblas_A64FX;
148153
#define gotoblas_ARMV8SVE gotoblas_ARMV8
149154
#define gotoblas_A64FX gotoblas_ARMV8
150155
#endif
156+
157+
#ifndef NO_SME
158+
extern gotoblas_t gotoblas_ARMV9SME;
159+
#else
160+
#define gotoblas_ARMV9SME gotoblas_ARMV8SVE
161+
#endif
162+
151163
extern gotoblas_t gotoblas_THUNDERX3T110;
152164
#endif
153165
#define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1
@@ -168,6 +180,9 @@ extern void openblas_warning(int verbose, const char * msg);
168180
#ifndef HWCAP_SVE
169181
#define HWCAP_SVE (1 << 22)
170182
#endif
183+
#ifndef HWCAP2_SME
184+
#define HWCAP2_SME (1 << 23) /* fallback AT_HWCAP2 bit for SME when the system headers do not define it; parenthesised like HWCAP_SVE */
185+
#endif
171186

172187
#define get_cpu_ftr(id, var) ({ \
173188
__asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \
@@ -393,6 +408,13 @@ static gotoblas_t *get_coretype(void) {
393408
snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
394409
openblas_warning(1, coremsg);
395410
}
411+
412+
#if !defined(NO_SME) && defined(HWCAP2_SME)
413+
if ((getauxval(AT_HWCAP2) & HWCAP2_SME)) {
414+
return &gotoblas_ARMV9SME;
415+
}
416+
#endif
417+
396418
#ifndef NO_SVE
397419
if ((getauxval(AT_HWCAP) & HWCAP_SVE)) {
398420
return &gotoblas_ARMV8SVE;
@@ -443,3 +465,15 @@ void gotoblas_dynamic_init(void) {
443465
void gotoblas_dynamic_quit(void) {
444466
gotoblas = NULL;
445467
}
468+
469+
int support_sme1(void) {
470+
int ret = 0;
471+
472+
#if (defined OS_LINUX || defined OS_ANDROID)
473+
ret = getauxval(AT_HWCAP2) & HWCAP2_SME;
474+
if(getauxval(AT_HWCAP2) & HWCAP2_SME){
475+
ret = 1;
476+
}
477+
#endif
478+
return ret;
479+
}

Diff for: getarch.c

+13
Original file line numberDiff line numberDiff line change
@@ -1289,6 +1289,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
12891289
#define CORENAME "ARMV8SVE"
12901290
#endif
12911291

1292+
#ifdef FORCE_ARMV9SME
1293+
#define FORCE
1294+
#define ARCHITECTURE "ARM64"
1295+
#define SUBARCHITECTURE "ARMV9SME"
1296+
#define SUBDIRNAME "arm64"
1297+
#define ARCHCONFIG "-DARMV9SME " \
1298+
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
1299+
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
1300+
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
1301+
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DHAVE_SME -DARMV8 -DARMV9"
1302+
#define LIBNAME "armv9sme"
1303+
#define CORENAME "ARMV9SME"
1304+
#endif
12921305

12931306
#ifdef FORCE_ARMV8
12941307
#define FORCE

0 commit comments

Comments
 (0)