Commit 8131e19

[LegalizeTypes] Teach DAGTypeLegalizer::GenWidenVectorLoads to pad with undef if needed when concatenating smaller loads to match a larger load

In the included test case, the align 16 allowed the v23f32 load to be handled as a v16f32 load, a v4f32 load, and another v4f32 load (one element of which is not used). These loads all need to be concatenated together into a final vector. Here we tried to concatenate the two v4f32 loads to match the type of the v16f32 load so we could do a second concat_vectors, but those two loads alone only add up to v8f32. So we need two v4f32 undefs to pad the concatenation out to v16f32.

It appears we've tried to hack around a similar issue in this code before by adding undef padding to loads in one of the earlier loops in this function: originally in r147964 by padding all loads narrower than previous loads to the same size, and later in r293088 by padding only the last load. This patch removes that earlier code and instead handles the padding on demand, at the point where we know we need it.

Fixes PR46820

Differential Revision: https://reviews.llvm.org/D84463
1 parent: 891759d
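To make the padding arithmetic in the message above concrete, here is a minimal standalone C++ sketch. It is not the LLVM code itself: undefPadCount and the bit-width parameters are hypothetical stand-ins for the EVT sizes involved.

#include <cassert>
#include <cstdio>

// Count how many undef pieces of the small width are needed to pad a run of
// small loaded pieces up to the width of the larger load they must match.
// Hypothetical helper, mirroring the idea in the patch, not its API.
static unsigned undefPadCount(unsigned largeBits, unsigned smallBits,
                              unsigned numSmallPieces) {
  assert(largeBits % smallBits == 0 && "widths must divide evenly");
  unsigned slots = largeBits / smallBits; // small pieces per large value
  assert(numSmallPieces <= slots && "already wide enough");
  return slots - numSmallPieces;
}

int main() {
  // v16f32 is 512 bits and v4f32 is 128 bits; two real v4f32 loads exist,
  // so 512/128 = 4 slots minus 2 real pieces leaves 2 undef pieces.
  printf("%u\n", undefPadCount(512, 128, 2)); // prints 2
  return 0;
}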

File tree

2 files changed (+59, -15 lines)

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

+12, -15
@@ -4912,7 +4912,8 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
 
   int LdWidth = LdVT.getSizeInBits();
   int WidthDiff = WidenWidth - LdWidth;
-  // Allow wider loads.
+  // Allow wider loads if they are sufficiently aligned to avoid memory faults
+  // and if the original load is simple.
   unsigned LdAlign = (!LD->isSimple()) ? 0 : LD->getAlignment();
 
   // Find the vector type that can load from.
@@ -4964,19 +4965,6 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
                       LD->getPointerInfo().getWithOffset(Offset),
                       LD->getOriginalAlign(), MMOFlags, AAInfo);
       LdChain.push_back(L.getValue(1));
-      if (L->getValueType(0).isVector() && NewVTWidth >= LdWidth) {
-        // Later code assumes the vector loads produced will be mergeable, so we
-        // must pad the final entry up to the previous width. Scalars are
-        // combined separately.
-        SmallVector<SDValue, 16> Loads;
-        Loads.push_back(L);
-        unsigned size = L->getValueSizeInBits(0);
-        while (size < LdOp->getValueSizeInBits(0)) {
-          Loads.push_back(DAG.getUNDEF(L->getValueType(0)));
-          size += L->getValueSizeInBits(0);
-        }
-        L = DAG.getNode(ISD::CONCAT_VECTORS, dl, LdOp->getValueType(0), Loads);
-      }
     } else {
       L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
                       LD->getPointerInfo().getWithOffset(Offset),
@@ -5017,8 +5005,17 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
     EVT NewLdTy = LdOps[i].getValueType();
     if (NewLdTy != LdTy) {
       // Create a larger vector.
+      unsigned NumOps = NewLdTy.getSizeInBits() / LdTy.getSizeInBits();
+      assert(NewLdTy.getSizeInBits() % LdTy.getSizeInBits() == 0);
+      SmallVector<SDValue, 16> WidenOps(NumOps);
+      unsigned j = 0;
+      for (; j != End-Idx; ++j)
+        WidenOps[j] = ConcatOps[Idx+j];
+      for (; j != NumOps; ++j)
+        WidenOps[j] = DAG.getUNDEF(LdTy);
+
       ConcatOps[End-1] = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewLdTy,
-                                     makeArrayRef(&ConcatOps[Idx], End - Idx));
+                                     WidenOps);
       Idx = End - 1;
       LdTy = NewLdTy;
     }
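For intuition about the new WidenOps code above, here is a self-contained C++ sketch of the same copy-then-pad loop. Plain integers stand in for SDValue operands (0 plays the role of undef); NumOps, WidenOps, Idx, and End mirror the patch's names, while everything else is illustrative.

#include <cassert>
#include <cstdio>
#include <vector>

int main() {
  // Model the PR46820 case: ConcatOps holds two v4f32 loads (tagged 1 and 2)
  // that must be widened to one v16f32-typed concat_vectors entry.
  const unsigned LdTyBits = 128;    // width of v4f32
  const unsigned NewLdTyBits = 512; // width of v16f32
  std::vector<int> ConcatOps = {1, 2};
  const unsigned Idx = 0;
  const unsigned End = static_cast<unsigned>(ConcatOps.size());

  assert(NewLdTyBits % LdTyBits == 0);
  unsigned NumOps = NewLdTyBits / LdTyBits; // 4 v4f32 slots per v16f32
  std::vector<int> WidenOps(NumOps);
  unsigned j = 0;
  for (; j != End - Idx; ++j) // first copy the real operands
    WidenOps[j] = ConcatOps[Idx + j];
  for (; j != NumOps; ++j)    // then pad the remaining slots with undef
    WidenOps[j] = 0;

  for (int Op : WidenOps)
    printf("%d ", Op); // prints: 1 2 0 0
  printf("\n");
  return 0;
}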

llvm/test/CodeGen/X86/pr46820.ll

+47
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s
+
+; The alignment of 16 causes type legalization to split this as 3 loads,
+; v16f32, v4f32, and v4f32. This loads 24 elements, but the load is aligned
+; to 16 bytes so this is safe. There was an issue with type legalization
+; building the proper concat_vectors for this because the two v4f32s don't
+; add up to v16f32 and require padding.
+
+define <23 x float> @load23(<23 x float>* %p) {
+; CHECK-LABEL: load23:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    vmovups 64(%rsi), %ymm0
+; CHECK-NEXT:    vmovups (%rsi), %zmm1
+; CHECK-NEXT:    vmovaps 64(%rsi), %xmm2
+; CHECK-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss %xmm3, 88(%rdi)
+; CHECK-NEXT:    vmovaps %xmm2, 64(%rdi)
+; CHECK-NEXT:    vmovaps %zmm1, (%rdi)
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vmovlps %xmm0, 80(%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %t0 = load <23 x float>, <23 x float>* %p, align 16
+  ret <23 x float> %t0
+}
+
+; Same test as above with minimal alignment just to demonstrate the different
+; codegen.
+define <23 x float> @load23_align_1(<23 x float>* %p) {
+; CHECK-LABEL: load23_align_1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    vmovups (%rsi), %zmm0
+; CHECK-NEXT:    vmovups 64(%rsi), %xmm1
+; CHECK-NEXT:    movq 80(%rsi), %rcx
+; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss %xmm2, 88(%rdi)
+; CHECK-NEXT:    movq %rcx, 80(%rdi)
+; CHECK-NEXT:    vmovaps %xmm1, 64(%rdi)
+; CHECK-NEXT:    vmovaps %zmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %t0 = load <23 x float>, <23 x float>* %p, align 1
+  ret <23 x float> %t0
+}
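As a footnote to the comment in load23 above, here is a small C++ arithmetic sketch of why the trailing 16-byte load cannot fault. The constants are illustrative, and the reasoning assumes page sizes are a multiple of 16 bytes, which holds on the targets in question.

#include <cassert>
#include <cstdio>

int main() {
  const unsigned ObjBytes = 23 * 4; // <23 x float> occupies 92 bytes
  const unsigned Align = 16;        // guaranteed by `align 16` on the load
  // The widened loads cover byte ranges [0,64), [64,80), and [80,96).
  const unsigned LastOff = 80, LastBytes = 16;
  // The base is 16-byte aligned and the last load reads 16 bytes from a
  // 16-byte-aligned offset, so it stays inside one aligned 16-byte chunk
  // and can never cross a page boundary into unmapped memory.
  assert(LastOff % Align == 0 && LastBytes <= Align);
  printf("bytes read past the object: %u\n",
         LastOff + LastBytes - ObjBytes); // prints 4
  return 0;
}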
