
Commit a99b65f

[CIR][CIRGen][Builtin][Neon] Lower vget_lane_bf16, vduph_lane f16 and bf16 (#1360)
1 parent 19e5076 commit a99b65f

File tree

3 files changed: +678 −1 lines changed


clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp

Lines changed: 2 additions & 1 deletion
@@ -3889,7 +3889,8 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
   case NEON::BI__builtin_neon_vget_lane_bf16:
   case NEON::BI__builtin_neon_vduph_lane_bf16:
   case NEON::BI__builtin_neon_vduph_lane_f16: {
-    llvm_unreachable("NEON::BI__builtin_neon_vduph_lane_f16 NYI");
+    return builder.create<cir::VecExtractOp>(getLoc(E->getExprLoc()), Ops[0],
+                                             emitScalarExpr(E->getArg(1)));
   }
   case NEON::BI__builtin_neon_vgetq_lane_bf16:
   case NEON::BI__builtin_neon_vduph_laneq_bf16:
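
For context, a minimal standalone usage sketch (not part of the commit; the function names are mine, and the f16 variant additionally assumes the fp16 target features are enabled). It shows the intrinsics behind the three builtins this case now handles: CIRGen emits a cir.vec.extract of the vector operand at the given lane, which is expected to lower to an LLVM extractelement, as the bf16 test below checks end to end.

// Hypothetical usage sketch, not taken from the commit. Assumes an AArch64
// target with -target-feature +bf16 (and +fullfp16 for the f16 variant).
#include <arm_neon.h>

// vget_lane_bf16 / vduph_lane_bf16: read one bf16 lane from a 4-lane vector.
// With this change, CIRGen emits cir.vec.extract(vector, lane) for these
// builtins instead of hitting the llvm_unreachable NYI path.
bfloat16_t read_lane_bf16(bfloat16x4_t v) {
  return vduph_lane_bf16(v, 2); // lane index must be a constant in [0, 3]
}

// vduph_lane_f16: the half-precision variant handled by the same switch case.
float16_t read_lane_f16(float16x4_t v) {
  return vduph_lane_f16(v, 2);
}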
Lines changed: 181 additions & 0 deletions
@@ -0,0 +1,181 @@
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +bf16 \
// RUN: -fclangir -disable-O0-optnone \
// RUN: -flax-vector-conversions=none -emit-cir -o %t.cir %s
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s

// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +bf16 \
// RUN: -fclangir -disable-O0-optnone \
// RUN: -flax-vector-conversions=none -emit-llvm -fno-clangir-call-conv-lowering -o - %s \
// RUN: | opt -S -passes=mem2reg,simplifycfg -o %t.ll
// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s

// REQUIRES: aarch64-registered-target || arm-registered-target

// This test mimics clang/test/CodeGen/AArch64/bf16-getset-intrinsics.c, which eventually
// CIR shall be able to support fully. Since this is going to take some time to converge,
// the unsupported/NYI code is commented out, so that we can incrementally improve this.
// The NYI filecheck used contains the LLVM output from OG codegen that should guide the
// correct result when implementing this into the CIR pipeline.

#include <arm_neon.h>

// CHECK-LABEL: @test_vcreate_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A:%.*]] to <4 x bfloat>
// CHECK-NEXT: ret <4 x bfloat> [[TMP0]]
//
// bfloat16x4_t test_vcreate_bf16(uint64_t a) {
// return vcreate_bf16(a);
// }

// CHECK-LABEL: @test_vdup_n_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x bfloat> poison, bfloat [[V:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x bfloat> [[VECINIT_I]], bfloat [[V]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x bfloat> [[VECINIT1_I]], bfloat [[V]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x bfloat> [[VECINIT2_I]], bfloat [[V]], i32 3
// CHECK-NEXT: ret <4 x bfloat> [[VECINIT3_I]]
//
// bfloat16x4_t test_vdup_n_bf16(bfloat16_t v) {
// return vdup_n_bf16(v);
// }

// CHECK-LABEL: @test_vdupq_n_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x bfloat> poison, bfloat [[V:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x bfloat> [[VECINIT_I]], bfloat [[V]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x bfloat> [[VECINIT1_I]], bfloat [[V]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x bfloat> [[VECINIT2_I]], bfloat [[V]], i32 3
// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x bfloat> [[VECINIT3_I]], bfloat [[V]], i32 4
// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x bfloat> [[VECINIT4_I]], bfloat [[V]], i32 5
// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x bfloat> [[VECINIT5_I]], bfloat [[V]], i32 6
// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x bfloat> [[VECINIT6_I]], bfloat [[V]], i32 7
// CHECK-NEXT: ret <8 x bfloat> [[VECINIT7_I]]
//
// bfloat16x8_t test_vdupq_n_bf16(bfloat16_t v) {
// return vdupq_n_bf16(v);
// }

// CHECK-LABEL: @test_vdup_lane_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x bfloat>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: ret <4 x bfloat> [[LANE]]
//
// bfloat16x4_t test_vdup_lane_bf16(bfloat16x4_t v) {
// return vdup_lane_bf16(v, 1);
// }

// CHECK-LABEL: @test_vdupq_lane_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x bfloat>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP1]], <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: ret <8 x bfloat> [[LANE]]
//
// bfloat16x8_t test_vdupq_lane_bf16(bfloat16x4_t v) {
// return vdupq_lane_bf16(v, 1);
// }

// CHECK-LABEL: @test_vdup_laneq_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x bfloat>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: ret <4 x bfloat> [[LANE]]
//
// bfloat16x4_t test_vdup_laneq_bf16(bfloat16x8_t v) {
// return vdup_laneq_bf16(v, 7);
// }

// CHECK-LABEL: @test_vdupq_laneq_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x bfloat>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: ret <8 x bfloat> [[LANE]]
//
// bfloat16x8_t test_vdupq_laneq_bf16(bfloat16x8_t v) {
// return vdupq_laneq_bf16(v, 7);
// }

// CHECK-LABEL: @test_vcombine_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[LOW:%.*]], <4 x bfloat> [[HIGH:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: ret <8 x bfloat> [[SHUFFLE_I]]
//
// bfloat16x8_t test_vcombine_bf16(bfloat16x4_t low, bfloat16x4_t high) {
// return vcombine_bf16(low, high);
// }

// CHECK-LABEL: @test_vget_high_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: ret <4 x bfloat> [[SHUFFLE_I]]
//
// bfloat16x4_t test_vget_high_bf16(bfloat16x8_t a) {
// return vget_high_bf16(a);
// }

// CHECK-LABEL: @test_vget_low_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-NEXT: ret <4 x bfloat> [[SHUFFLE_I]]
//
// bfloat16x4_t test_vget_low_bf16(bfloat16x8_t a) {
// return vget_low_bf16(a);
// }

bfloat16_t test_vget_lane_bf16(bfloat16x4_t v) {
  return vget_lane_bf16(v, 1);

  // CIR-LABEL: vget_lane_bf16
  // CIR: [[TMP0:%.*]] = cir.const #cir.int<1> : !s32i
  // CIR: [[TMP1:%.*]] = cir.vec.extract {{.*}}[{{.*}} : !s32i] : !cir.vector<!cir.bf16 x 4>

  // LLVM-LABEL: test_vget_lane_bf16
  // LLVM-SAME: (<4 x bfloat> [[VEC:%.*]])
  // LLVM: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[VEC]], i32 1
  // LLVM: ret bfloat [[VGET_LANE]]
}

// CHECK-LABEL: @test_vgetq_lane_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x bfloat> [[V:%.*]], i32 7
// CHECK-NEXT: ret bfloat [[VGETQ_LANE]]
//
// bfloat16_t test_vgetq_lane_bf16(bfloat16x8_t v) {
// return vgetq_lane_bf16(v, 7);
// }

// CHECK-LABEL: @test_vset_lane_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x bfloat> [[V:%.*]], bfloat [[A:%.*]], i32 1
// CHECK-NEXT: ret <4 x bfloat> [[VSET_LANE]]
//
// bfloat16x4_t test_vset_lane_bf16(bfloat16_t a, bfloat16x4_t v) {
// return vset_lane_bf16(a, v, 1);
// }

// CHECK-LABEL: @test_vsetq_lane_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x bfloat> [[V:%.*]], bfloat [[A:%.*]], i32 7
// CHECK-NEXT: ret <8 x bfloat> [[VSET_LANE]]
//
// bfloat16x8_t test_vsetq_lane_bf16(bfloat16_t a, bfloat16x8_t v) {
// return vsetq_lane_bf16(a, v, 7);
// }

bfloat16_t test_vduph_lane_bf16(bfloat16x4_t v) {
  return vduph_lane_bf16(v, 1);

  // CIR-LABEL: vduph_lane_bf16
  // CIR: [[TMP0:%.*]] = cir.const #cir.int<1> : !s32i
  // CIR: [[TMP1:%.*]] = cir.vec.extract {{.*}}[{{.*}} : !s32i] : !cir.vector<!cir.bf16 x 4>

  // LLVM-LABEL: test_vduph_lane_bf16
  // LLVM-SAME: (<4 x bfloat> [[VEC:%.*]])
  // LLVM: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[VEC]], i32 1
  // LLVM: ret bfloat [[VGET_LANE]]
}
