[Clang][ARM] Make CRC and DSP intrinsics always available. #107417

DanielKristofKiss · 2024-09-05T15:55:14Z

Both feature has target feature so can be checked if the usage is valid.

llvmbot · 2024-09-05T15:55:48Z

@llvm/pr-subscribers-backend-x86

@llvm/pr-subscribers-clang

Author: Daniel Kiss (DanielKristofKiss)

Changes

Both feature has target feature so can be checked if the usage is valid.

Full diff: https://github.com/llvm/llvm-project/pull/107417.diff

2 Files Affected:

(modified) clang/lib/Headers/arm_acle.h (+18-21)
(modified) clang/test/CodeGen/arm_acle.c (+73-3)

diff --git a/clang/lib/Headers/arm_acle.h b/clang/lib/Headers/arm_acle.h
index 1518b0c4c8428f..b1dc90f84ad36f 100644
--- a/clang/lib/Headers/arm_acle.h
+++ b/clang/lib/Headers/arm_acle.h
@@ -264,28 +264,28 @@ __rbitl(unsigned long __t) {
 }
 
 /* 8.3 16-bit multiplications */
-#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
-static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
 __smulbb(int32_t __a, int32_t __b) {
   return __builtin_arm_smulbb(__a, __b);
 }
-static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
 __smulbt(int32_t __a, int32_t __b) {
   return __builtin_arm_smulbt(__a, __b);
 }
-static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
 __smultb(int32_t __a, int32_t __b) {
   return __builtin_arm_smultb(__a, __b);
 }
-static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
 __smultt(int32_t __a, int32_t __b) {
   return __builtin_arm_smultt(__a, __b);
 }
-static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
 __smulwb(int32_t __a, int32_t __b) {
   return __builtin_arm_smulwb(__a, __b);
 }
-static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
 __smulwt(int32_t __a, int32_t __b) {
   return __builtin_arm_smulwt(__a, __b);
 }
@@ -304,46 +304,46 @@ __smulwt(int32_t __a, int32_t __b) {
 #endif
 
 /* 8.4.2 Saturating addition and subtraction intrinsics */
-#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
 __qadd(int32_t __t, int32_t __v) {
   return __builtin_arm_qadd(__t, __v);
 }
 
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
 __qsub(int32_t __t, int32_t __v) {
   return __builtin_arm_qsub(__t, __v);
 }
 
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
 __qdbl(int32_t __t) {
   return __builtin_arm_qadd(__t, __t);
 }
 #endif
 
 /* 8.4.3 Accumulating multiplications */
-#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
 __smlabb(int32_t __a, int32_t __b, int32_t __c) {
   return __builtin_arm_smlabb(__a, __b, __c);
 }
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
 __smlabt(int32_t __a, int32_t __b, int32_t __c) {
   return __builtin_arm_smlabt(__a, __b, __c);
 }
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
 __smlatb(int32_t __a, int32_t __b, int32_t __c) {
   return __builtin_arm_smlatb(__a, __b, __c);
 }
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
 __smlatt(int32_t __a, int32_t __b, int32_t __c) {
   return __builtin_arm_smlatt(__a, __b, __c);
 }
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
 __smlawb(int32_t __a, int32_t __b, int32_t __c) {
   return __builtin_arm_smlawb(__a, __b, __c);
 }
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
 __smlawt(int32_t __a, int32_t __b, int32_t __c) {
   return __builtin_arm_smlawt(__a, __b, __c);
 }
@@ -621,8 +621,6 @@ __rintnf(float __a) {
 #endif
 
 /* 8.8 CRC32 intrinsics */
-#if (defined(__ARM_FEATURE_CRC32) && __ARM_FEATURE_CRC32) ||                   \
-    (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
 __crc32b(uint32_t __a, uint8_t __b) {
   return __builtin_arm_crc32b(__a, __b);
@@ -662,7 +660,6 @@ static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target
 __crc32cd(uint32_t __a, uint64_t __b) {
   return __builtin_arm_crc32cd(__a, __b);
 }
-#endif
 
 /* 8.6 Floating-point data-processing intrinsics */
 /* Armv8.3-A Javascript conversion intrinsic */
diff --git a/clang/test/CodeGen/arm_acle.c b/clang/test/CodeGen/arm_acle.c
index 1c41f1b5d23f0c..74de8246d7de6e 100644
--- a/clang/test/CodeGen/arm_acle.c
+++ b/clang/test/CodeGen/arm_acle.c
@@ -1,4 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -ffreestanding -triple armv8a-none-eabi -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefixes=ARM,AArch32
 // RUN: %clang_cc1 -ffreestanding -triple armv8a-none-eabi -target-feature +crc -target-feature +dsp -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefixes=ARM,AArch32
 // RUN: %clang_cc1 -ffreestanding -Wno-error=implicit-function-declaration -triple aarch64-none-elf -target-feature +neon -target-feature +crc -target-feature +crypto -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefixes=ARM,AArch64
 // RUN: %clang_cc1 -ffreestanding -triple aarch64-none-elf -target-feature +v8.3a -target-feature +crc -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefixes=ARM,AArch64,AArch6483
@@ -638,12 +639,15 @@ uint32_t test_usat(int32_t t) {
 #endif
 
 /* 9.4.2 Saturating addition and subtraction intrinsics */
-#ifdef __ARM_FEATURE_DSP
+#ifdef __ARM_32BIT_STATE
 // AArch32-LABEL: @test_qadd(
 // AArch32-NEXT:  entry:
 // AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.qadd(i32 [[A:%.*]], i32 [[B:%.*]])
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
+#ifndef __ARM_FEATURE_DSP
+__attribute__((target("dsp")))
+#endif
 int32_t test_qadd(int32_t a, int32_t b) {
   return __qadd(a, b);
 }
@@ -653,6 +657,9 @@ int32_t test_qadd(int32_t a, int32_t b) {
 // AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.qsub(i32 [[A:%.*]], i32 [[B:%.*]])
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
+#ifndef __ARM_FEATURE_DSP
+__attribute__((target("dsp")))
+#endif
 int32_t test_qsub(int32_t a, int32_t b) {
   return __qsub(a, b);
 }
@@ -664,6 +671,9 @@ extern int32_t f();
 // AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.qadd(i32 [[CALL]], i32 [[CALL]])
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
+#ifndef __ARM_FEATURE_DSP
+__attribute__((target("dsp")))
+#endif
 int32_t test_qdbl() {
   return __qdbl(f());
 }
@@ -672,12 +682,15 @@ int32_t test_qdbl() {
 /*
  * 9.3 16-bit multiplications
  */
-#if __ARM_FEATURE_DSP
+#ifdef __ARM_32BIT_STATE
 // AArch32-LABEL: @test_smulbb(
 // AArch32-NEXT:  entry:
 // AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smulbb(i32 [[A:%.*]], i32 [[B:%.*]])
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
+#ifndef __ARM_FEATURE_DSP
+__attribute__((target("dsp")))
+#endif
 int32_t test_smulbb(int32_t a, int32_t b) {
   return __smulbb(a, b);
 }
@@ -687,6 +700,9 @@ int32_t test_smulbb(int32_t a, int32_t b) {
 // AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smulbt(i32 [[A:%.*]], i32 [[B:%.*]])
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
+#ifndef __ARM_FEATURE_DSP
+__attribute__((target("dsp")))
+#endif
 int32_t test_smulbt(int32_t a, int32_t b) {
   return __smulbt(a, b);
 }
@@ -696,6 +712,9 @@ int32_t test_smulbt(int32_t a, int32_t b) {
 // AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smultb(i32 [[A:%.*]], i32 [[B:%.*]])
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
+#ifndef __ARM_FEATURE_DSP
+__attribute__((target("dsp")))
+#endif
 int32_t test_smultb(int32_t a, int32_t b) {
   return __smultb(a, b);
 }
@@ -705,6 +724,9 @@ int32_t test_smultb(int32_t a, int32_t b) {
 // AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smultt(i32 [[A:%.*]], i32 [[B:%.*]])
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
+#ifndef __ARM_FEATURE_DSP
+__attribute__((target("dsp")))
+#endif
 int32_t test_smultt(int32_t a, int32_t b) {
   return __smultt(a, b);
 }
@@ -714,6 +736,9 @@ int32_t test_smultt(int32_t a, int32_t b) {
 // AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smulwb(i32 [[A:%.*]], i32 [[B:%.*]])
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
+#ifndef __ARM_FEATURE_DSP
+__attribute__((target("dsp")))
+#endif
 int32_t test_smulwb(int32_t a, int32_t b) {
   return __smulwb(a, b);
 }
@@ -723,18 +748,24 @@ int32_t test_smulwb(int32_t a, int32_t b) {
 // AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smulwt(i32 [[A:%.*]], i32 [[B:%.*]])
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
+#ifndef __ARM_FEATURE_DSP
+__attribute__((target("dsp")))
+#endif
 int32_t test_smulwt(int32_t a, int32_t b) {
   return __smulwt(a, b);
 }
 #endif
 
 /* 9.4.3 Accumultating multiplications */
-#if __ARM_FEATURE_DSP
+#ifdef __ARM_32BIT_STATE
 // AArch32-LABEL: @test_smlabb(
 // AArch32-NEXT:  entry:
 // AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smlabb(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]])
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
+#ifndef __ARM_FEATURE_DSP
+__attribute__((target("dsp")))
+#endif
 int32_t test_smlabb(int32_t a, int32_t b, int32_t c) {
   return __smlabb(a, b, c);
 }
@@ -744,6 +775,9 @@ int32_t test_smlabb(int32_t a, int32_t b, int32_t c) {
 // AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smlabt(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]])
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
+#ifndef __ARM_FEATURE_DSP
+__attribute__((target("dsp")))
+#endif
 int32_t test_smlabt(int32_t a, int32_t b, int32_t c) {
   return __smlabt(a, b, c);
 }
@@ -753,6 +787,9 @@ int32_t test_smlabt(int32_t a, int32_t b, int32_t c) {
 // AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smlatb(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]])
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
+#ifndef __ARM_FEATURE_DSP
+__attribute__((target("dsp")))
+#endif
 int32_t test_smlatb(int32_t a, int32_t b, int32_t c) {
   return __smlatb(a, b, c);
 }
@@ -762,6 +799,9 @@ int32_t test_smlatb(int32_t a, int32_t b, int32_t c) {
 // AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smlatt(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]])
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
+#ifndef __ARM_FEATURE_DSP
+__attribute__((target("dsp")))
+#endif
 int32_t test_smlatt(int32_t a, int32_t b, int32_t c) {
   return __smlatt(a, b, c);
 }
@@ -771,6 +811,9 @@ int32_t test_smlatt(int32_t a, int32_t b, int32_t c) {
 // AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smlawb(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]])
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
+#ifndef __ARM_FEATURE_DSP
+__attribute__((target("dsp")))
+#endif
 int32_t test_smlawb(int32_t a, int32_t b, int32_t c) {
   return __smlawb(a, b, c);
 }
@@ -780,6 +823,9 @@ int32_t test_smlawb(int32_t a, int32_t b, int32_t c) {
 // AArch32-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.smlawt(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]])
 // AArch32-NEXT:    ret i32 [[TMP0]]
 //
+#ifndef __ARM_FEATURE_DSP
+__attribute__((target("dsp")))
+#endif
 int32_t test_smlawt(int32_t a, int32_t b, int32_t c) {
   return __smlawt(a, b, c);
 }
@@ -1335,6 +1381,9 @@ int32_t test_smusdx(int16x2_t a, int16x2_t b) {
 // AArch64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32b(i32 [[A:%.*]], i32 [[TMP0]])
 // AArch64-NEXT:    ret i32 [[TMP1]]
 //
+#ifndef __ARM_FEATURE_CRC32
+__attribute__((target("crc")))
+#endif
 uint32_t test_crc32b(uint32_t a, uint8_t b) {
   return __crc32b(a, b);
 }
@@ -1351,6 +1400,9 @@ uint32_t test_crc32b(uint32_t a, uint8_t b) {
 // AArch64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32h(i32 [[A:%.*]], i32 [[TMP0]])
 // AArch64-NEXT:    ret i32 [[TMP1]]
 //
+#ifndef __ARM_FEATURE_CRC32
+__attribute__((target("crc")))
+#endif
 uint32_t test_crc32h(uint32_t a, uint16_t b) {
   return __crc32h(a, b);
 }
@@ -1365,6 +1417,9 @@ uint32_t test_crc32h(uint32_t a, uint16_t b) {
 // AArch64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32w(i32 [[A:%.*]], i32 [[B:%.*]])
 // AArch64-NEXT:    ret i32 [[TMP0]]
 //
+#ifndef __ARM_FEATURE_CRC32
+__attribute__((target("crc")))
+#endif
 uint32_t test_crc32w(uint32_t a, uint32_t b) {
   return __crc32w(a, b);
 }
@@ -1383,6 +1438,9 @@ uint32_t test_crc32w(uint32_t a, uint32_t b) {
 // AArch64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32x(i32 [[A:%.*]], i64 [[B:%.*]])
 // AArch64-NEXT:    ret i32 [[TMP0]]
 //
+#ifndef __ARM_FEATURE_CRC32
+__attribute__((target("crc")))
+#endif
 uint32_t test_crc32d(uint32_t a, uint64_t b) {
   return __crc32d(a, b);
 }
@@ -1399,6 +1457,9 @@ uint32_t test_crc32d(uint32_t a, uint64_t b) {
 // AArch64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32cb(i32 [[A:%.*]], i32 [[TMP0]])
 // AArch64-NEXT:    ret i32 [[TMP1]]
 //
+#ifndef __ARM_FEATURE_CRC32
+__attribute__((target("crc")))
+#endif
 uint32_t test_crc32cb(uint32_t a, uint8_t b) {
   return __crc32cb(a, b);
 }
@@ -1415,6 +1476,9 @@ uint32_t test_crc32cb(uint32_t a, uint8_t b) {
 // AArch64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32ch(i32 [[A:%.*]], i32 [[TMP0]])
 // AArch64-NEXT:    ret i32 [[TMP1]]
 //
+#ifndef __ARM_FEATURE_CRC32
+__attribute__((target("crc")))
+#endif
 uint32_t test_crc32ch(uint32_t a, uint16_t b) {
   return __crc32ch(a, b);
 }
@@ -1429,6 +1493,9 @@ uint32_t test_crc32ch(uint32_t a, uint16_t b) {
 // AArch64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32cw(i32 [[A:%.*]], i32 [[B:%.*]])
 // AArch64-NEXT:    ret i32 [[TMP0]]
 //
+#ifndef __ARM_FEATURE_CRC32
+__attribute__((target("crc")))
+#endif
 uint32_t test_crc32cw(uint32_t a, uint32_t b) {
   return __crc32cw(a, b);
 }
@@ -1447,6 +1514,9 @@ uint32_t test_crc32cw(uint32_t a, uint32_t b) {
 // AArch64-NEXT:    [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32cx(i32 [[A:%.*]], i64 [[B:%.*]])
 // AArch64-NEXT:    ret i32 [[TMP0]]
 //
+#ifndef __ARM_FEATURE_CRC32
+__attribute__((target("crc")))
+#endif
 uint32_t test_crc32cd(uint32_t a, uint64_t b) {
   return __crc32cd(a, b);
 }

github-actions · 2024-09-05T15:59:17Z

⚠️ C/C++ code formatter, clang-format found issues in your code. ⚠️

You can test this locally with the following command:

git-clang-format --diff 1693d8eb9aa94b0e8e2395234e6c63b57a2017b7 9dadc9bffc40e02dff9ef6a1d79968c8980892f4 --extensions c,h -- clang/lib/Headers/arm_acle.h clang/test/CodeGen/arm_acle.c

View the diff from clang-format here.

diff --git a/clang/lib/Headers/arm_acle.h b/clang/lib/Headers/arm_acle.h
index b1dc90f84a..48801c38f1 100644
--- a/clang/lib/Headers/arm_acle.h
+++ b/clang/lib/Headers/arm_acle.h
@@ -265,28 +265,34 @@ __rbitl(unsigned long __t) {
 
 /* 8.3 16-bit multiplications */
 #if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
-static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
-__smulbb(int32_t __a, int32_t __b) {
+static __inline__ int32_t
+    __attribute__((__always_inline__, __nodebug__, target("dsp")))
+    __smulbb(int32_t __a, int32_t __b) {
   return __builtin_arm_smulbb(__a, __b);
 }
-static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
-__smulbt(int32_t __a, int32_t __b) {
+static __inline__ int32_t
+    __attribute__((__always_inline__, __nodebug__, target("dsp")))
+    __smulbt(int32_t __a, int32_t __b) {
   return __builtin_arm_smulbt(__a, __b);
 }
-static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
-__smultb(int32_t __a, int32_t __b) {
+static __inline__ int32_t
+    __attribute__((__always_inline__, __nodebug__, target("dsp")))
+    __smultb(int32_t __a, int32_t __b) {
   return __builtin_arm_smultb(__a, __b);
 }
-static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
-__smultt(int32_t __a, int32_t __b) {
+static __inline__ int32_t
+    __attribute__((__always_inline__, __nodebug__, target("dsp")))
+    __smultt(int32_t __a, int32_t __b) {
   return __builtin_arm_smultt(__a, __b);
 }
-static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
-__smulwb(int32_t __a, int32_t __b) {
+static __inline__ int32_t
+    __attribute__((__always_inline__, __nodebug__, target("dsp")))
+    __smulwb(int32_t __a, int32_t __b) {
   return __builtin_arm_smulwb(__a, __b);
 }
-static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
-__smulwt(int32_t __a, int32_t __b) {
+static __inline__ int32_t
+    __attribute__((__always_inline__, __nodebug__, target("dsp")))
+    __smulwt(int32_t __a, int32_t __b) {
   return __builtin_arm_smulwt(__a, __b);
 }
 #endif
@@ -305,46 +311,55 @@ __smulwt(int32_t __a, int32_t __b) {
 
 /* 8.4.2 Saturating addition and subtraction intrinsics */
 #if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
-__qadd(int32_t __t, int32_t __v) {
+static __inline__ int32_t
+    __attribute__((__always_inline__, __nodebug__, target("dsp")))
+    __qadd(int32_t __t, int32_t __v) {
   return __builtin_arm_qadd(__t, __v);
 }
 
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
-__qsub(int32_t __t, int32_t __v) {
+static __inline__ int32_t
+    __attribute__((__always_inline__, __nodebug__, target("dsp")))
+    __qsub(int32_t __t, int32_t __v) {
   return __builtin_arm_qsub(__t, __v);
 }
 
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
-__qdbl(int32_t __t) {
+static __inline__ int32_t
+    __attribute__((__always_inline__, __nodebug__, target("dsp")))
+    __qdbl(int32_t __t) {
   return __builtin_arm_qadd(__t, __t);
 }
 #endif
 
 /* 8.4.3 Accumulating multiplications */
 #if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
-__smlabb(int32_t __a, int32_t __b, int32_t __c) {
+static __inline__ int32_t
+    __attribute__((__always_inline__, __nodebug__, target("dsp")))
+    __smlabb(int32_t __a, int32_t __b, int32_t __c) {
   return __builtin_arm_smlabb(__a, __b, __c);
 }
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
-__smlabt(int32_t __a, int32_t __b, int32_t __c) {
+static __inline__ int32_t
+    __attribute__((__always_inline__, __nodebug__, target("dsp")))
+    __smlabt(int32_t __a, int32_t __b, int32_t __c) {
   return __builtin_arm_smlabt(__a, __b, __c);
 }
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
-__smlatb(int32_t __a, int32_t __b, int32_t __c) {
+static __inline__ int32_t
+    __attribute__((__always_inline__, __nodebug__, target("dsp")))
+    __smlatb(int32_t __a, int32_t __b, int32_t __c) {
   return __builtin_arm_smlatb(__a, __b, __c);
 }
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
-__smlatt(int32_t __a, int32_t __b, int32_t __c) {
+static __inline__ int32_t
+    __attribute__((__always_inline__, __nodebug__, target("dsp")))
+    __smlatt(int32_t __a, int32_t __b, int32_t __c) {
   return __builtin_arm_smlatt(__a, __b, __c);
 }
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
-__smlawb(int32_t __a, int32_t __b, int32_t __c) {
+static __inline__ int32_t
+    __attribute__((__always_inline__, __nodebug__, target("dsp")))
+    __smlawb(int32_t __a, int32_t __b, int32_t __c) {
   return __builtin_arm_smlawb(__a, __b, __c);
 }
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
-__smlawt(int32_t __a, int32_t __b, int32_t __c) {
+static __inline__ int32_t
+    __attribute__((__always_inline__, __nodebug__, target("dsp")))
+    __smlawt(int32_t __a, int32_t __b, int32_t __c) {
   return __builtin_arm_smlawt(__a, __b, __c);
 }
 #endif

DanielKristofKiss · 2024-09-06T15:13:53Z

Clang-format failure is expected as I kept intentionally the format to match with the rest of the file.

davemgreen

I think this is OK but do you know if the dsp side works with cortex-m? Target attributes are less likely to be used there, but it's worth testing if the command line args are still all happy.

DanielKristofKiss · 2024-09-13T08:50:34Z

I think this is OK but do you know if the dsp side works with cortex-m? Target attributes are less likely to be used there, but it's worth testing if the command line args are still all happy.

This seems works fine:

#include <arm_acle.h>
int dsp() {
    return __smlabb(1,3,4);
}

cat dpp.c | bin/clang dpp.c -O2 -target thumb-none-gnueabi -mcpu=cortex-m4 -c -o - | llvm-objdump -d -

<stdin>:        file format elf32-littlearm

Disassembly of section .text:

00000000 <dsp>:
       0: 2004          movs    r0, #0x4
       2: 2103          movs    r1, #0x3
       4: 2201          movs    r2, #0x1
       6: fb12 0001     smlabb  r0, r2, r1, r0
       a: 4770          bx      lr

could be added to the clang/test/Codegen/acle_test.c too. but we seem don't have too many M class test for these things...

// RUN: %clang_cc1 -ffreestanding -triple thumbv7em-unknown-none-gnueabi -target-feature -crc -target-feature +dsp -O0 -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s -check-prefixes=ARM,AArch32`
``

davemgreen

Thanks for checking - it was the other way around that I was also thinking about. That with -mcpu=cortex-m4+nodsp or -mcpu=cortex-m3 was giving sensible error messages. It looks like it is OK from what I can tell, so LGTM

llvm-ci · 2024-09-16T07:46:09Z

LLVM Buildbot has detected a new failure on builder openmp-offload-libc-amdgpu-runtime running on omp-vega20-1 while building clang at step 10 "Add check check-offload".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/73/builds/5567

Here is the relevant piece of the build log for the reference

Step 10 (Add check check-offload) failure: test (failure)
******************** TEST 'libomptarget :: amdgcn-amd-amdhsa :: sanitizer/kernel_crash_async.c' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
# RUN: at line 2
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/clang -fopenmp    -I /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test -I /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src  -nogpulib -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib  -fopenmp-targets=amdgcn-amd-amdhsa -O3 /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/kernel_crash_async.c -o /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/sanitizer/Output/kernel_crash_async.c.tmp /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib/libomptarget.devicertl.a
# executed command: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/clang -fopenmp -I /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test -I /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -nogpulib -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib -fopenmp-targets=amdgcn-amd-amdhsa -O3 /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/kernel_crash_async.c -o /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/sanitizer/Output/kernel_crash_async.c.tmp /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib/libomptarget.devicertl.a
# RUN: at line 3
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/sanitizer/Output/kernel_crash_async.c.tmp 2>&1 | /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/kernel_crash_async.c --check-prefixes=TRACE
# executed command: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/sanitizer/Output/kernel_crash_async.c.tmp
# executed command: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/kernel_crash_async.c --check-prefixes=TRACE
# RUN: at line 4
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/not --crash /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/sanitizer/Output/kernel_crash_async.c.tmp 2>&1 | /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/kernel_crash_async.c --check-prefixes=CHECK
# executed command: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/not --crash /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/sanitizer/Output/kernel_crash_async.c.tmp
# executed command: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/kernel_crash_async.c --check-prefixes=CHECK
# .---command stderr------------
# | /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/kernel_crash_async.c:39:11: error: CHECK: expected string not found in input
# | // CHECK: Kernel {{[0-9]}}: {{.*}} (__omp_offloading_{{.*}}_main_l29)
# |           ^
# | <stdin>:1:1: note: scanning from here
# | Display only launched kernel:
# | ^
# | <stdin>:2:16: note: possible intended match here
# | Kernel 'omp target in main @ 29 (__omp_offloading_802_d8283c2_main_l29)'
# |                ^
# | 
# | Input file: <stdin>
# | Check file: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/sanitizer/kernel_crash_async.c
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<<<<<
# |             1: Display only launched kernel: 
# | check:39'0     X~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ error: no match found
# |             2: Kernel 'omp target in main @ 29 (__omp_offloading_802_d8283c2_main_l29)' 
# | check:39'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# | check:39'1                    ?                                                          possible intended match
# |             3: OFFLOAD ERROR: Memory access fault by GPU 1 (agent 0x55daf17e06e0) at virtual address (nil). Reasons: Page not present or supervisor privilege, Write access to a read-only page 
# | check:39'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |             4: Use 'OFFLOAD_TRACK_ALLOCATION_TRACES=true' to track device allocations 
# | check:39'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |             5: error: Aborted 
# | check:39'0     ~~~~~~~~~~~~~~~
# | >>>>>>
# `-----------------------------
# error: command failed with exit status: 1

--
...

[Clang][ARM] Make CRC and DSP intrinsics always available.

9dadc9b

Both feature has target feature so can be checked if the usage is valid.

DanielKristofKiss requested review from ddcc, davemgreen and enh-google September 5, 2024 15:55

llvmbot added clang Clang issues not falling into any other category backend:X86 clang:headers Headers provided by Clang, e.g. for intrinsics labels Sep 5, 2024

davemgreen reviewed Sep 12, 2024

View reviewed changes

davemgreen approved these changes Sep 13, 2024

View reviewed changes

DanielKristofKiss merged commit cf2122c into llvm:main Sep 16, 2024
11 of 12 checks passed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[Clang][ARM] Make CRC and DSP intrinsics always available. #107417

[Clang][ARM] Make CRC and DSP intrinsics always available. #107417

DanielKristofKiss commented Sep 5, 2024

llvmbot commented Sep 5, 2024 •

edited

Loading

github-actions bot commented Sep 5, 2024

DanielKristofKiss commented Sep 6, 2024

davemgreen left a comment

DanielKristofKiss commented Sep 13, 2024

davemgreen left a comment

llvm-ci commented Sep 16, 2024

[Clang][ARM] Make CRC and DSP intrinsics always available. #107417

[Clang][ARM] Make CRC and DSP intrinsics always available. #107417

Conversation

DanielKristofKiss commented Sep 5, 2024

llvmbot commented Sep 5, 2024 • edited Loading

github-actions bot commented Sep 5, 2024

DanielKristofKiss commented Sep 6, 2024

davemgreen left a comment

Choose a reason for hiding this comment

DanielKristofKiss commented Sep 13, 2024

davemgreen left a comment

Choose a reason for hiding this comment

llvm-ci commented Sep 16, 2024

llvmbot commented Sep 5, 2024 •

edited

Loading