Skip to content

Commit f7b5c25

Browse files
authored
[AArch64][SME] Remove immediate argument restriction for svldr and svstr (#68565)
The svldr_vnum and svstr_vnum builtins always modify the base register and tile slice, and pass an immediate offset of zero to the instruction, even when the offset provided to the builtin is itself an immediate. This patch optimises the code generated for these builtins when the offset is an immediate: the immediate is passed directly to the instruction, so the base register and tile slice updates are no longer needed.
1 parent bfd3734 commit f7b5c25

File tree

11 files changed

+569
-169
lines changed

11 files changed

+569
-169
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9886,18 +9886,10 @@ Value *CodeGenFunction::EmitSMEZero(const SVETypeFlags &TypeFlags,
98869886
Value *CodeGenFunction::EmitSMELdrStr(const SVETypeFlags &TypeFlags,
98879887
SmallVectorImpl<Value *> &Ops,
98889888
unsigned IntID) {
9889-
if (Ops.size() == 3) {
9890-
Function *Cntsb = CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb);
9891-
llvm::Value *CntsbCall = Builder.CreateCall(Cntsb, {}, "svlb");
9892-
9893-
llvm::Value *VecNum = Ops[2];
9894-
llvm::Value *MulVL = Builder.CreateMul(CntsbCall, VecNum, "mulvl");
9895-
9896-
Ops[1] = Builder.CreateGEP(Int8Ty, Ops[1], MulVL);
9897-
Ops[0] = Builder.CreateAdd(
9898-
Ops[0], Builder.CreateIntCast(VecNum, Int32Ty, true), "tileslice");
9899-
Ops.erase(&Ops[2]);
9900-
}
9889+
if (Ops.size() == 2)
9890+
Ops.push_back(Builder.getInt32(0));
9891+
else
9892+
Ops[2] = Builder.CreateIntCast(Ops[2], Int32Ty, true);
99019893
Function *F = CGM.getIntrinsic(IntID, {});
99029894
return Builder.CreateCall(F, Ops);
99039895
}

clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c

Lines changed: 31 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -6,86 +6,53 @@
66

77
#include <arm_sme_draft_spec_subject_to_change.h>
88

9-
// CHECK-C-LABEL: define dso_local void @test_svldr_vnum_za(
10-
// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
11-
// CHECK-C-NEXT: entry:
12-
// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE]], ptr [[PTR]])
13-
// CHECK-C-NEXT: ret void
14-
//
15-
// CHECK-CXX-LABEL: define dso_local void @_Z18test_svldr_vnum_zajPKv(
16-
// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
17-
// CHECK-CXX-NEXT: entry:
18-
// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE]], ptr [[PTR]])
19-
// CHECK-CXX-NEXT: ret void
9+
// CHECK-C-LABEL: @test_svldr_vnum_za(
10+
// CHECK-CXX-LABEL: @_Z18test_svldr_vnum_zajPKv(
11+
// CHECK-NEXT: entry:
12+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0)
13+
// CHECK-NEXT: ret void
2014
//
2115
void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) {
2216
svldr_vnum_za(slice_base, ptr, 0);
2317
}
2418

25-
// CHECK-C-LABEL: define dso_local void @test_svldr_vnum_za_1(
26-
// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] {
27-
// CHECK-C-NEXT: entry:
28-
// CHECK-C-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
29-
// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], 15
30-
// CHECK-C-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
31-
// CHECK-C-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE]], 15
32-
// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TILESLICE]], ptr [[TMP0]])
33-
// CHECK-C-NEXT: ret void
34-
//
35-
// CHECK-CXX-LABEL: define dso_local void @_Z20test_svldr_vnum_za_1jPKv(
36-
// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] {
37-
// CHECK-CXX-NEXT: entry:
38-
// CHECK-CXX-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
39-
// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], 15
40-
// CHECK-CXX-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
41-
// CHECK-CXX-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE]], 15
42-
// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TILESLICE]], ptr [[TMP0]])
43-
// CHECK-CXX-NEXT: ret void
19+
// CHECK-C-LABEL: @test_svldr_vnum_za_1(
20+
// CHECK-CXX-LABEL: @_Z20test_svldr_vnum_za_1jPKv(
21+
// CHECK-NEXT: entry:
22+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 15)
23+
// CHECK-NEXT: ret void
4424
//
4525
void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) {
4626
svldr_vnum_za(slice_base, ptr, 15);
4727
}
4828

49-
// CHECK-C-LABEL: define dso_local void @test_svldr_za(
50-
// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] {
51-
// CHECK-C-NEXT: entry:
52-
// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE]], ptr [[PTR]])
53-
// CHECK-C-NEXT: ret void
54-
//
55-
// CHECK-CXX-LABEL: define dso_local void @_Z13test_svldr_zajPKv(
56-
// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] {
57-
// CHECK-CXX-NEXT: entry:
58-
// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE]], ptr [[PTR]])
59-
// CHECK-CXX-NEXT: ret void
29+
// CHECK-C-LABEL: @test_svldr_za(
30+
// CHECK-CXX-LABEL: @_Z13test_svldr_zajPKv(
31+
// CHECK-NEXT: entry:
32+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0)
33+
// CHECK-NEXT: ret void
6034
//
6135
void test_svldr_za(uint32_t slice_base, const void *ptr) {
6236
svldr_za(slice_base, ptr);
6337
}
6438

65-
// CHECK-C-LABEL: define dso_local void @test_svldr_vnum_za_var(
66-
// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
67-
// CHECK-C-NEXT: entry:
68-
// CHECK-C-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
69-
// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], [[VNUM]]
70-
// CHECK-C-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
71-
// CHECK-C-NEXT: [[TMP1:%.*]] = trunc i64 [[VNUM]] to i32
72-
// CHECK-C-NEXT: [[TILESLICE:%.*]] = add i32 [[TMP1]], [[SLICE_BASE]]
73-
// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TILESLICE]], ptr [[TMP0]])
74-
// CHECK-C-NEXT: ret void
75-
//
76-
// CHECK-CXX-LABEL: define dso_local void @_Z22test_svldr_vnum_za_varjPKvl(
77-
// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
78-
// CHECK-CXX-NEXT: entry:
79-
// CHECK-CXX-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
80-
// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], [[VNUM]]
81-
// CHECK-CXX-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
82-
// CHECK-CXX-NEXT: [[TMP1:%.*]] = trunc i64 [[VNUM]] to i32
83-
// CHECK-CXX-NEXT: [[TILESLICE:%.*]] = add i32 [[TMP1]], [[SLICE_BASE]]
84-
// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TILESLICE]], ptr [[TMP0]])
85-
// CHECK-CXX-NEXT: ret void
39+
// CHECK-C-LABEL: @test_svldr_vnum_za_var(
40+
// CHECK-CXX-LABEL: @_Z22test_svldr_vnum_za_varjPKvl(
41+
// CHECK-NEXT: entry:
42+
// CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[VNUM:%.*]] to i32
43+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 [[TMP0:%.*]])
44+
// CHECK-NEXT: ret void
8645
//
8746
void test_svldr_vnum_za_var(uint32_t slice_base, const void *ptr, int64_t vnum) {
8847
svldr_vnum_za(slice_base, ptr, vnum);
8948
}
90-
//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
91-
// CHECK: {{.*}}
49+
50+
// CHECK-C-LABEL: @test_svldr_vnum_za_2(
51+
// CHECK-CXX-LABEL: @_Z20test_svldr_vnum_za_2jPKv(
52+
// CHECK-NEXT: entry:
53+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 16)
54+
// CHECK-NEXT: ret void
55+
//
56+
void test_svldr_vnum_za_2(uint32_t slice_base, const void *ptr) {
57+
svldr_vnum_za(slice_base, ptr, 16);
58+
}

clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c

Lines changed: 31 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -6,86 +6,53 @@
66

77
#include <arm_sme_draft_spec_subject_to_change.h>
88

9-
// CHECK-C-LABEL: define dso_local void @test_svstr_vnum_za(
10-
// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
11-
// CHECK-C-NEXT: entry:
12-
// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE]], ptr [[PTR]])
13-
// CHECK-C-NEXT: ret void
14-
//
15-
// CHECK-CXX-LABEL: define dso_local void @_Z18test_svstr_vnum_zajPv(
16-
// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
17-
// CHECK-CXX-NEXT: entry:
18-
// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE]], ptr [[PTR]])
19-
// CHECK-CXX-NEXT: ret void
9+
// CHECK-C-LABEL: @test_svstr_vnum_za(
10+
// CHECK-CXX-LABEL: @_Z18test_svstr_vnum_zajPv(
11+
// CHECK-NEXT: entry:
12+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0)
13+
// CHECK-NEXT: ret void
2014
//
2115
void test_svstr_vnum_za(uint32_t slice_base, void *ptr) {
2216
svstr_vnum_za(slice_base, ptr, 0);
2317
}
2418

25-
// CHECK-C-LABEL: define dso_local void @test_svstr_vnum_za_1(
26-
// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] {
27-
// CHECK-C-NEXT: entry:
28-
// CHECK-C-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
29-
// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], 15
30-
// CHECK-C-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
31-
// CHECK-C-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE]], 15
32-
// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[TILESLICE]], ptr [[TMP0]])
33-
// CHECK-C-NEXT: ret void
34-
//
35-
// CHECK-CXX-LABEL: define dso_local void @_Z20test_svstr_vnum_za_1jPv(
36-
// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] {
37-
// CHECK-CXX-NEXT: entry:
38-
// CHECK-CXX-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
39-
// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], 15
40-
// CHECK-CXX-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
41-
// CHECK-CXX-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE]], 15
42-
// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[TILESLICE]], ptr [[TMP0]])
43-
// CHECK-CXX-NEXT: ret void
19+
// CHECK-C-LABEL: @test_svstr_vnum_za_1(
20+
// CHECK-CXX-LABEL: @_Z20test_svstr_vnum_za_1jPv(
21+
// CHECK-NEXT: entry:
22+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 15)
23+
// CHECK-NEXT: ret void
4424
//
4525
void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) {
4626
svstr_vnum_za(slice_base, ptr, 15);
4727
}
4828

49-
// CHECK-C-LABEL: define dso_local void @test_svstr_za(
50-
// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] {
51-
// CHECK-C-NEXT: entry:
52-
// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE]], ptr [[PTR]])
53-
// CHECK-C-NEXT: ret void
54-
//
55-
// CHECK-CXX-LABEL: define dso_local void @_Z13test_svstr_zajPv(
56-
// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] {
57-
// CHECK-CXX-NEXT: entry:
58-
// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE]], ptr [[PTR]])
59-
// CHECK-CXX-NEXT: ret void
29+
// CHECK-C-LABEL: @test_svstr_za(
30+
// CHECK-CXX-LABEL: @_Z13test_svstr_zajPv(
31+
// CHECK-NEXT: entry:
32+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0)
33+
// CHECK-NEXT: ret void
6034
//
6135
void test_svstr_za(uint32_t slice_base, void *ptr) {
6236
svstr_za(slice_base, ptr);
6337
}
6438

65-
// CHECK-C-LABEL: define dso_local void @test_svstr_vnum_za_var(
66-
// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
67-
// CHECK-C-NEXT: entry:
68-
// CHECK-C-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
69-
// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], [[VNUM]]
70-
// CHECK-C-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
71-
// CHECK-C-NEXT: [[TMP1:%.*]] = trunc i64 [[VNUM]] to i32
72-
// CHECK-C-NEXT: [[TILESLICE:%.*]] = add i32 [[TMP1]], [[SLICE_BASE]]
73-
// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[TILESLICE]], ptr [[TMP0]])
74-
// CHECK-C-NEXT: ret void
75-
//
76-
// CHECK-CXX-LABEL: define dso_local void @_Z22test_svstr_vnum_za_varjPvl(
77-
// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
78-
// CHECK-CXX-NEXT: entry:
79-
// CHECK-CXX-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
80-
// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], [[VNUM]]
81-
// CHECK-CXX-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
82-
// CHECK-CXX-NEXT: [[TMP1:%.*]] = trunc i64 [[VNUM]] to i32
83-
// CHECK-CXX-NEXT: [[TILESLICE:%.*]] = add i32 [[TMP1]], [[SLICE_BASE]]
84-
// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[TILESLICE]], ptr [[TMP0]])
85-
// CHECK-CXX-NEXT: ret void
39+
// CHECK-C-LABEL: @test_svstr_vnum_za_var(
40+
// CHECK-CXX-LABEL: @_Z22test_svstr_vnum_za_varjPvl(
41+
// CHECK-NEXT: entry:
42+
// CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[VNUM:%.*]] to i32
43+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 [[TMP0:%.*]])
44+
// CHECK-NEXT: ret void
8645
//
8746
void test_svstr_vnum_za_var(uint32_t slice_base, void *ptr, int64_t vnum) {
8847
svstr_vnum_za(slice_base, ptr, vnum);
8948
}
90-
//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
91-
// CHECK: {{.*}}
49+
50+
// CHECK-C-LABEL: @test_svstr_vnum_za_2(
51+
// CHECK-CXX-LABEL: @_Z20test_svstr_vnum_za_2jPv(
52+
// CHECK-NEXT: entry:
53+
// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 16)
54+
// CHECK-NEXT: ret void
55+
//
56+
void test_svstr_vnum_za_2(uint32_t slice_base, void *ptr) {
57+
svstr_vnum_za(slice_base, ptr, 16);
58+
}

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2679,10 +2679,10 @@ let TargetPrefix = "aarch64" in {
26792679
def int_aarch64_sme_st1q_vert : SME_Load_Store_Intrinsic<llvm_nxv1i1_ty>;
26802680

26812681
// Spill + fill
2682-
def int_aarch64_sme_ldr : DefaultAttrsIntrinsic<
2683-
[], [llvm_i32_ty, llvm_ptr_ty]>;
2684-
def int_aarch64_sme_str : DefaultAttrsIntrinsic<
2685-
[], [llvm_i32_ty, llvm_ptr_ty]>;
2682+
class SME_LDR_STR_ZA_Intrinsic
2683+
: DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_ptr_ty, llvm_i32_ty]>;
2684+
def int_aarch64_sme_ldr : SME_LDR_STR_ZA_Intrinsic;
2685+
def int_aarch64_sme_str : SME_LDR_STR_ZA_Intrinsic;
26862686

26872687
class SME_TileToVector_Intrinsic
26882688
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
@@ -3454,4 +3454,9 @@ let TargetPrefix = "aarch64" in {
34543454
def int_aarch64_sve_sel_x2 : SVE2_VG2_Sel_Intrinsic;
34553455
def int_aarch64_sve_sel_x4 : SVE2_VG4_Sel_Intrinsic;
34563456

3457+
class SME_LDR_STR_ZT_Intrinsic
3458+
: DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_ptr_ty]>;
3459+
def int_aarch64_sme_ldr_zt : SME_LDR_STR_ZT_Intrinsic;
3460+
def int_aarch64_sme_str_zt : SME_LDR_STR_ZT_Intrinsic;
3461+
34573462
}

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2406,6 +2406,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
24062406
MAKE_CASE(AArch64ISD::FCMP)
24072407
MAKE_CASE(AArch64ISD::STRICT_FCMP)
24082408
MAKE_CASE(AArch64ISD::STRICT_FCMPE)
2409+
MAKE_CASE(AArch64ISD::SME_ZA_LDR)
2410+
MAKE_CASE(AArch64ISD::SME_ZA_STR)
24092411
MAKE_CASE(AArch64ISD::DUP)
24102412
MAKE_CASE(AArch64ISD::DUPLANE8)
24112413
MAKE_CASE(AArch64ISD::DUPLANE16)
@@ -4830,6 +4832,90 @@ SDValue AArch64TargetLowering::getPStateSM(SelectionDAG &DAG, SDValue Chain,
48304832
Mask);
48314833
}
48324834

4835+
// Lower an SME LDR/STR ZA intrinsic
4836+
// Case 1: If the vector number (vecnum) is an immediate in range, it gets
4837+
// folded into the instruction
4838+
// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
4839+
// Case 2: If the vecnum is not an immediate, then it is used to modify the base
4840+
// and tile slice registers
4841+
// ldr(%tileslice, %ptr, %vecnum)
4842+
// ->
4843+
// %svl = rdsvl
4844+
// %ptr2 = %ptr + %svl * %vecnum
4845+
// %tileslice2 = %tileslice + %vecnum
4846+
// ldr [%tileslice2, 0], [%ptr2, 0]
4847+
// Case 3: If the vecnum is an immediate out of range, then the same is done as
4848+
// case 2, but the base and slice registers are modified by the greatest
4849+
// multiple of 15 lower than the vecnum and the remainder is folded into the
4850+
// instruction. This means that successive loads and stores that are offset from
4851+
// each other can share the same base and slice register updates.
4852+
// ldr(%tileslice, %ptr, 22)
4853+
// ldr(%tileslice, %ptr, 23)
4854+
// ->
4855+
// %svl = rdsvl
4856+
// %ptr2 = %ptr + %svl * 15
4857+
// %tileslice2 = %tileslice + 15
4858+
// ldr [%tileslice2, 7], [%ptr2, 7]
4859+
// ldr [%tileslice2, 8], [%ptr2, 8]
4860+
// Case 4: If the vecnum is an add of an immediate, then the non-immediate
4861+
// operand and the immediate can be folded into the instruction, like case 2.
4862+
// ldr(%tileslice, %ptr, %vecnum + 7)
4863+
// ldr(%tileslice, %ptr, %vecnum + 8)
4864+
// ->
4865+
// %svl = rdsvl
4866+
// %ptr2 = %ptr + %svl * %vecnum
4867+
// %tileslice2 = %tileslice + %vecnum
4868+
// ldr [%tileslice2, 7], [%ptr2, 7]
4869+
// ldr [%tileslice2, 8], [%ptr2, 8]
4870+
// Case 5: The vecnum being an add of an immediate out of range is also handled,
4871+
// in which case the same remainder logic as case 3 is used.
4872+
SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
4873+
SDLoc DL(N);
4874+
4875+
SDValue TileSlice = N->getOperand(2);
4876+
SDValue Base = N->getOperand(3);
4877+
SDValue VecNum = N->getOperand(4);
4878+
int32_t ConstAddend = 0;
4879+
SDValue VarAddend = VecNum;
4880+
4881+
// If the vnum is an add of an immediate, we can fold it into the instruction
4882+
if (VecNum.getOpcode() == ISD::ADD &&
4883+
isa<ConstantSDNode>(VecNum.getOperand(1))) {
4884+
ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
4885+
VarAddend = VecNum.getOperand(0);
4886+
} else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
4887+
ConstAddend = ImmNode->getSExtValue();
4888+
VarAddend = SDValue();
4889+
}
4890+
4891+
int32_t ImmAddend = ConstAddend % 16;
4892+
if (int32_t C = (ConstAddend - ImmAddend)) {
4893+
SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
4894+
VarAddend = VarAddend
4895+
? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
4896+
: CVal;
4897+
}
4898+
4899+
if (VarAddend) {
4900+
// Get the vector length that will be multiplied by vnum
4901+
auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
4902+
DAG.getConstant(1, DL, MVT::i32));
4903+
4904+
// Multiply SVL and vnum then add it to the base
4905+
SDValue Mul = DAG.getNode(
4906+
ISD::MUL, DL, MVT::i64,
4907+
{SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
4908+
Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
4909+
// Just add vnum to the tileslice
4910+
TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
4911+
}
4912+
4913+
return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
4914+
DL, MVT::Other,
4915+
{/*Chain=*/N.getOperand(0), TileSlice, Base,
4916+
DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
4917+
}
4918+
48334919
SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
48344920
SelectionDAG &DAG) const {
48354921
unsigned IntNo = Op.getConstantOperandVal(1);
@@ -4853,6 +4939,10 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
48534939
return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
48544940
DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
48554941
}
4942+
case Intrinsic::aarch64_sme_str:
4943+
case Intrinsic::aarch64_sme_ldr: {
4944+
return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
4945+
}
48564946
case Intrinsic::aarch64_sme_za_enable:
48574947
return DAG.getNode(
48584948
AArch64ISD::SMSTART, DL, MVT::Other,

0 commit comments

Comments
 (0)