Commit d5f4f08

[RISCV] Always expand zero strided vp.strided.load (#98901)
This patch makes zero strided VP loads always be expanded to a scalar load and splat, even if +optimized-zero-stride-load is present. Expanding them allows more .vx splat patterns to be matched, which is needed to prevent regressions in #98111. If the feature is present, RISCVISelDAGToDAG will combine the expansion back into a zero strided load. The RV32 test diff also shows that we still need to emit a zero strided load either way after expanding an SEW=64 strided load. We could maybe fix this in a later patch by not doing the expansion when SEW > XLEN.
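To make the transform concrete, here is a minimal before/after sketch in LLVM IR for the fixed-vector case exercised by the new test below. The function names and the insertelement/shufflevector splat form are illustrative assumptions, not copied from the pass output:

define <4 x i64> @sketch_before(<4 x i64> %v, ptr %ptr) {
  ; Zero strided VP load: every lane reads the same scalar, but the add below
  ; cannot use vadd.vx while the value is still a strided load.
  %load = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i32(ptr %ptr, i32 0, <4 x i1> splat (i1 true), i32 4)
  %w = add <4 x i64> %v, %load
  ret <4 x i64> %w
}

define <4 x i64> @sketch_after(<4 x i64> %v, ptr %ptr) {
  ; After the expansion (sketch): one scalar load plus a splat. ISel can now
  ; fold the splat operand into vadd.vx, or re-form a zero strided vlse64.v
  ; when +optimized-zero-stride-load is available and no .vx pattern applies.
  %scalar = load i64, ptr %ptr
  %head = insertelement <4 x i64> poison, i64 %scalar, i64 0
  %splat = shufflevector <4 x i64> %head, <4 x i64> poison, <4 x i32> zeroinitializer
  %w = add <4 x i64> %v, %splat
  ret <4 x i64> %w
}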
1 parent 0309709 commit d5f4f08

3 files changed (+60, -5 lines)

llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp

Lines changed: 3 additions & 3 deletions

@@ -163,10 +163,10 @@ bool RISCVCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
   return true;
 }

+// Always expand zero strided loads so we match more .vx splat patterns, even if
+// we have +optimized-zero-stride-loads. RISCVDAGToDAGISel::Select will convert
+// it back to a strided load if it's optimized.
 bool RISCVCodeGenPrepare::expandVPStrideLoad(IntrinsicInst &II) {
-  if (ST->hasOptimizedZeroStrideLoad())
-    return false;
-
   Value *BasePtr, *VL;

   using namespace PatternMatch;
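The new comment names RISCVDAGToDAGISel::Select as the place that re-forms a zero strided load when the subtarget optimizes it. For scalable vectors the expansion has the same shape; below is a sketch of the expanded form for the nxv1i64 test added further down, written with a plain IR splat. The function name is hypothetical, and the exact splat representation (the pass may use a VL-aware splat so the EVL is preserved) is an assumption, not shown in this diff:

define <vscale x 1 x i64> @sketch_scalable_after(<vscale x 1 x i64> %v, ptr %ptr) {
  ; Sketch of the expanded zero strided load: one scalar load, splatted across
  ; the scalable vector, feeding an add that can now select to vadd.vx.
  %scalar = load i64, ptr %ptr
  %head = insertelement <vscale x 1 x i64> poison, i64 %scalar, i64 0
  %splat = shufflevector <vscale x 1 x i64> %head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
  %w = add <vscale x 1 x i64> %v, %splat
  ret <vscale x 1 x i64> %w
}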

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll

Lines changed: 29 additions & 2 deletions

@@ -638,7 +638,7 @@ declare <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0.i64(ptr, i64,
 define <4 x i8> @zero_strided_unmasked_vpload_4i8_i8(ptr %ptr) {
 ; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_4i8_i8:
 ; CHECK-OPT: # %bb.0:
-; CHECK-OPT-NEXT: vsetivli zero, 3, e8, mf4, ta, ma
+; CHECK-OPT-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-OPT-NEXT: vlse8.v v8, (a0), zero
 ; CHECK-OPT-NEXT: ret
 ;
@@ -657,7 +657,7 @@ define <4 x i8> @zero_strided_unmasked_vpload_4i8_i8(ptr %ptr) {
 define <4 x half> @zero_strided_unmasked_vpload_4f16(ptr %ptr) {
 ; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_4f16:
 ; CHECK-OPT: # %bb.0:
-; CHECK-OPT-NEXT: vsetivli zero, 3, e16, mf2, ta, ma
+; CHECK-OPT-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-OPT-NEXT: vlse16.v v8, (a0), zero
 ; CHECK-OPT-NEXT: ret
 ;
@@ -670,3 +670,30 @@ define <4 x half> @zero_strided_unmasked_vpload_4f16(ptr %ptr) {
   %load = call <4 x half> @llvm.experimental.vp.strided.load.4f16.p0.i32(ptr %ptr, i32 0, <4 x i1> splat (i1 true), i32 3)
   ret <4 x half> %load
 }
+
+define <4 x i64> @zero_strided_vadd.vx(<4 x i64> %v, ptr %ptr) {
+; CHECK-RV32-LABEL: zero_strided_vadd.vx:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: addi sp, sp, -16
+; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16
+; CHECK-RV32-NEXT: lw a1, 4(a0)
+; CHECK-RV32-NEXT: lw a0, 0(a0)
+; CHECK-RV32-NEXT: sw a1, 12(sp)
+; CHECK-RV32-NEXT: sw a0, 8(sp)
+; CHECK-RV32-NEXT: addi a0, sp, 8
+; CHECK-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-RV32-NEXT: vlse64.v v10, (a0), zero
+; CHECK-RV32-NEXT: vadd.vv v8, v8, v10
+; CHECK-RV32-NEXT: addi sp, sp, 16
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: zero_strided_vadd.vx:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ld a0, 0(a0)
+; CHECK-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-RV64-NEXT: vadd.vx v8, v8, a0
+; CHECK-RV64-NEXT: ret
+  %load = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i32(ptr %ptr, i32 0, <4 x i1> splat (i1 true), i32 4)
+  %w = add <4 x i64> %v, %load
+  ret <4 x i64> %w
+}

llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll

Lines changed: 28 additions & 0 deletions

@@ -822,3 +822,31 @@ define <vscale x 1 x half> @zero_strided_unmasked_vpload_nxv1f16(ptr %ptr) {
   %load = call <vscale x 1 x half> @llvm.experimental.vp.strided.load.nxv1f16.p0.i32(ptr %ptr, i32 0, <vscale x 1 x i1> splat (i1 true), i32 4)
   ret <vscale x 1 x half> %load
 }
+
+define <vscale x 1 x i64> @zero_strided_vadd.vx(<vscale x 1 x i64> %v, ptr %ptr) {
+; CHECK-RV32-LABEL: zero_strided_vadd.vx:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: addi sp, sp, -16
+; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16
+; CHECK-RV32-NEXT: lw a1, 4(a0)
+; CHECK-RV32-NEXT: lw a0, 0(a0)
+; CHECK-RV32-NEXT: sw a1, 12(sp)
+; CHECK-RV32-NEXT: sw a0, 8(sp)
+; CHECK-RV32-NEXT: addi a0, sp, 8
+; CHECK-RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-RV32-NEXT: vlse64.v v9, (a0), zero
+; CHECK-RV32-NEXT: vadd.vv v8, v8, v9
+; CHECK-RV32-NEXT: addi sp, sp, 16
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: zero_strided_vadd.vx:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ld a0, 0(a0)
+; CHECK-RV64-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-RV64-NEXT: vadd.vx v8, v8, a0
+; CHECK-RV64-NEXT: ret
+  %vscale = call i32 @llvm.vscale()
+  %load = call <vscale x 1 x i64> @llvm.experimental.vp.strided.load.nxv1i64.p0.i32(ptr %ptr, i32 0, <vscale x 1 x i1> splat (i1 true), i32 %vscale)
+  %w = add <vscale x 1 x i64> %v, %load
+  ret <vscale x 1 x i64> %w
+}
