
Commit 2ad9ceb

Author: serge-sans-paille
Fix interaction between stack alignment and inline-asm stack clash protection
As reported in rust-lang/rust#70143, alignment is not taken into account when probing. Fix that by adjusting the first probe if the stack alignment is small, or by extending the dynamic probing if the alignment is large.

Differential Revision: https://reviews.llvm.org/D84419

(cherry picked from commit f2c6bfa)
1 parent 2ae1278 commit 2ad9ceb

6 files changed: +512 −51 lines
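
For orientation, here is a minimal standalone sketch of the arithmetic the fix is built on (plain C++, not LLVM API; the 4096-byte probe interval is an assumption, matching the x86 default but configurable via the "stack-probe-size" attribute): the aligning AND can leave up to MaxAlign % StackProbeSize extra bytes between the unaligned and the aligned stack pointer, so the first probe is shrunk by exactly that amount.

#include <cstdint>
#include <cstdio>
#include <initializer_list>

int main() {
  const uint64_t StackProbeSize = 4096; // assumed probe interval (one page)
  for (uint64_t MaxAlign : {64ULL, 1024ULL, 16384ULL}) {
    // Worst-case residue the aligning AND leaves unprobed.
    const uint64_t AlignOffset = MaxAlign % StackProbeSize;
    printf("MaxAlign=%5llu -> AlignOffset=%4llu, first probe covers %4llu bytes\n",
           (unsigned long long)MaxAlign, (unsigned long long)AlignOffset,
           (unsigned long long)(StackProbeSize - AlignOffset));
  }
}

When MaxAlign >= StackProbeSize the modulo is zero and shrinking the first probe no longer helps, which is why BuildStackAlignAND below gains a probing loop of its own.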

llvm/lib/Target/X86/X86FrameLowering.cpp

+202 −20
@@ -586,29 +586,55 @@ void X86FrameLowering::emitStackProbeInlineGeneric(
   const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
   uint64_t ProbeChunk = StackProbeSize * 8;
 
+  uint64_t MaxAlign =
+      TRI->needsStackRealignment(MF) ? calculateMaxStackAlign(MF) : 0;
+
   // Synthesize a loop or unroll it, depending on the number of iterations.
+  // BuildStackAlignAND ensures that only MaxAlign % StackProbeSize bytes are
+  // left between the unaligned rsp and the current rsp.
   if (Offset > ProbeChunk) {
-    emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset);
+    emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset,
+                                    MaxAlign % StackProbeSize);
   } else {
-    emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset);
+    emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset,
+                                     MaxAlign % StackProbeSize);
   }
 }
 
 void X86FrameLowering::emitStackProbeInlineGenericBlock(
     MachineFunction &MF, MachineBasicBlock &MBB,
-    MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
-    uint64_t Offset) const {
+    MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset,
+    uint64_t AlignOffset) const {
 
   const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
   const X86TargetLowering &TLI = *STI.getTargetLowering();
   const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset);
   const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
   const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
+
   uint64_t CurrentOffset = 0;
-  // 0 Thanks to return address being saved on the stack
-  uint64_t CurrentProbeOffset = 0;
 
-  // For the first N - 1 pages, just probe. I tried to take advantage of
+  assert(AlignOffset < StackProbeSize);
+
+  // If the offset is so small it fits within a page, there's nothing to do.
+  if (StackProbeSize < Offset + AlignOffset) {
+
+    MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+                           .addReg(StackPtr)
+                           .addImm(StackProbeSize - AlignOffset)
+                           .setMIFlag(MachineInstr::FrameSetup);
+    MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+
+    addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))
+                     .setMIFlag(MachineInstr::FrameSetup),
+                 StackPtr, false, 0)
+        .addImm(0)
+        .setMIFlag(MachineInstr::FrameSetup);
+    NumFrameExtraProbe++;
+    CurrentOffset = StackProbeSize - AlignOffset;
+  }
+
+  // For the next N - 1 pages, just probe. I tried to take advantage of
   // natural probes but it implies much more logic and there were very few
   // interesting natural probes to interleave.
   while (CurrentOffset + StackProbeSize < Offset) {
@@ -626,9 +652,9 @@ void X86FrameLowering::emitStackProbeInlineGenericBlock(
         .setMIFlag(MachineInstr::FrameSetup);
     NumFrameExtraProbe++;
     CurrentOffset += StackProbeSize;
-    CurrentProbeOffset += StackProbeSize;
   }
 
+  // No need to probe the tail, it is smaller than a page.
   uint64_t ChunkSize = Offset - CurrentOffset;
   MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
                          .addReg(StackPtr)
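
Taken together, the two hunks above give the unrolled path the following probe schedule (a sketch, assuming StackProbeSize = 4096; printf stands in for the emitted SUB and MOV machine instructions, and probeBlock is a hypothetical name):

#include <cassert>
#include <cstdint>
#include <cstdio>

void probeBlock(uint64_t Offset, uint64_t AlignOffset,
                uint64_t StackProbeSize = 4096) {
  assert(AlignOffset < StackProbeSize);
  uint64_t CurrentOffset = 0;
  // The first chunk is shrunk by AlignOffset so that, even after the
  // aligning AND, consecutive probes stay at most StackProbeSize bytes apart.
  if (StackProbeSize < Offset + AlignOffset) {
    printf("sub rsp, %llu; probe\n",
           (unsigned long long)(StackProbeSize - AlignOffset));
    CurrentOffset = StackProbeSize - AlignOffset;
  }
  // One probe per remaining full page.
  while (CurrentOffset + StackProbeSize < Offset) {
    printf("sub rsp, %llu; probe\n", (unsigned long long)StackProbeSize);
    CurrentOffset += StackProbeSize;
  }
  // The tail is smaller than a page and needs no probe.
  printf("sub rsp, %llu\n", (unsigned long long)(Offset - CurrentOffset));
}

int main() { probeBlock(9000, 1024); }

With Offset = 9000 and AlignOffset = 1024 this prints a 3072-byte first chunk, one full page, and an unprobed 1832-byte tail, so no gap ever exceeds one page.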
@@ -639,15 +665,35 @@ void X86FrameLowering::emitStackProbeInlineGenericBlock(
 
 void X86FrameLowering::emitStackProbeInlineGenericLoop(
     MachineFunction &MF, MachineBasicBlock &MBB,
-    MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
-    uint64_t Offset) const {
+    MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset,
+    uint64_t AlignOffset) const {
   assert(Offset && "null offset");
 
   const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
   const X86TargetLowering &TLI = *STI.getTargetLowering();
   const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
   const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
 
+  if (AlignOffset) {
+    if (AlignOffset < StackProbeSize) {
+      // Perform a first smaller allocation followed by a probe.
+      const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, AlignOffset);
+      MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), StackPtr)
+                             .addReg(StackPtr)
+                             .addImm(AlignOffset)
+                             .setMIFlag(MachineInstr::FrameSetup);
+      MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+
+      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))
+                       .setMIFlag(MachineInstr::FrameSetup),
+                   StackPtr, false, 0)
+          .addImm(0)
+          .setMIFlag(MachineInstr::FrameSetup);
+      NumFrameExtraProbe++;
+      Offset -= AlignOffset;
+    }
+  }
+
   // Synthesize a loop
   NumFrameLoopProbe++;
   const BasicBlock *LLVM_BB = MBB.getBasicBlock();
@@ -666,17 +712,17 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop(
 
   // save loop bound
   {
-    const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset);
-    BuildMI(MBB, MBBI, DL, TII.get(Opc), FinalStackProbed)
+    const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, Offset);
+    BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), FinalStackProbed)
         .addReg(FinalStackProbed)
         .addImm(Offset / StackProbeSize * StackProbeSize)
         .setMIFlag(MachineInstr::FrameSetup);
   }
 
   // allocate a page
   {
-    const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize);
-    BuildMI(testMBB, DL, TII.get(Opc), StackPtr)
+    const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize);
+    BuildMI(testMBB, DL, TII.get(SUBOpc), StackPtr)
         .addReg(StackPtr)
         .addImm(StackProbeSize)
         .setMIFlag(MachineInstr::FrameSetup);
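
The loop path now follows the same idea (a sketch under the same assumptions as before; probeLoop is a hypothetical name, and the sub-page remainder after the loop is allocated by code outside this diff):

#include <cstdint>
#include <cstdio>

void probeLoop(uint64_t Offset, uint64_t AlignOffset,
               uint64_t StackProbeSize = 4096) {
  // A first smaller allocation followed by a probe, mirroring the new
  // AlignOffset block above.
  if (AlignOffset && AlignOffset < StackProbeSize) {
    printf("sub rsp, %llu; probe\n", (unsigned long long)AlignOffset);
    Offset -= AlignOffset;
  }
  // The loop bound is Offset rounded down to a whole number of pages,
  // matching the Offset / StackProbeSize * StackProbeSize expression above.
  const uint64_t LoopBound = Offset / StackProbeSize * StackProbeSize;
  for (uint64_t Done = 0; Done < LoopBound; Done += StackProbeSize)
    printf("sub rsp, %llu; probe\n", (unsigned long long)StackProbeSize);
}

int main() { probeLoop(73728 + 1024, 1024); }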
@@ -1052,13 +1098,149 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
                                           uint64_t MaxAlign) const {
   uint64_t Val = -MaxAlign;
   unsigned AndOp = getANDriOpcode(Uses64BitFramePtr, Val);
-  MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg)
-                         .addReg(Reg)
-                         .addImm(Val)
-                         .setMIFlag(MachineInstr::FrameSetup);
 
-  // The EFLAGS implicit def is dead.
-  MI->getOperand(3).setIsDead();
+  MachineFunction &MF = *MBB.getParent();
+  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+  const X86TargetLowering &TLI = *STI.getTargetLowering();
+  const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
+  const bool EmitInlineStackProbe = TLI.hasInlineStackProbe(MF);
+
+  // We want to make sure that (in the worst case) less than StackProbeSize
+  // bytes are left unprobed after the AND. This assumption is used in
+  // emitStackProbeInlineGeneric.
+  if (Reg == StackPtr && EmitInlineStackProbe && MaxAlign >= StackProbeSize) {
+    {
+      NumFrameLoopProbe++;
+      MachineBasicBlock *entryMBB =
+          MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+      MachineBasicBlock *headMBB =
+          MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+      MachineBasicBlock *bodyMBB =
+          MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+      MachineBasicBlock *footMBB =
+          MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+
+      MachineFunction::iterator MBBIter = MBB.getIterator();
+      MF.insert(MBBIter, entryMBB);
+      MF.insert(MBBIter, headMBB);
+      MF.insert(MBBIter, bodyMBB);
+      MF.insert(MBBIter, footMBB);
+      const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
+      Register FinalStackProbed = Uses64BitFramePtr ? X86::R11 : X86::R11D;
+
+      // Setup entry block
+      {
+
+        entryMBB->splice(entryMBB->end(), &MBB, MBB.begin(), MBBI);
+        BuildMI(entryMBB, DL, TII.get(TargetOpcode::COPY), FinalStackProbed)
+            .addReg(StackPtr)
+            .setMIFlag(MachineInstr::FrameSetup);
+        MachineInstr *MI =
+            BuildMI(entryMBB, DL, TII.get(AndOp), FinalStackProbed)
+                .addReg(FinalStackProbed)
+                .addImm(Val)
+                .setMIFlag(MachineInstr::FrameSetup);
+
+        // The EFLAGS implicit def is dead.
+        MI->getOperand(3).setIsDead();
+
+        BuildMI(entryMBB, DL,
+                TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
+            .addReg(FinalStackProbed)
+            .addReg(StackPtr)
+            .setMIFlag(MachineInstr::FrameSetup);
+        BuildMI(entryMBB, DL, TII.get(X86::JCC_1))
+            .addMBB(&MBB)
+            .addImm(X86::COND_E)
+            .setMIFlag(MachineInstr::FrameSetup);
+        entryMBB->addSuccessor(headMBB);
+        entryMBB->addSuccessor(&MBB);
+      }
+
+      // Loop entry block
+
+      {
+        const unsigned SUBOpc =
+            getSUBriOpcode(Uses64BitFramePtr, StackProbeSize);
+        BuildMI(headMBB, DL, TII.get(SUBOpc), StackPtr)
+            .addReg(StackPtr)
+            .addImm(StackProbeSize)
+            .setMIFlag(MachineInstr::FrameSetup);
+
+        BuildMI(headMBB, DL,
+                TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
+            .addReg(FinalStackProbed)
+            .addReg(StackPtr)
+            .setMIFlag(MachineInstr::FrameSetup);
+
+        // jump
+        BuildMI(headMBB, DL, TII.get(X86::JCC_1))
+            .addMBB(footMBB)
+            .addImm(X86::COND_B)
+            .setMIFlag(MachineInstr::FrameSetup);
+
+        headMBB->addSuccessor(bodyMBB);
+        headMBB->addSuccessor(footMBB);
+      }
+
+      // setup loop body
+      {
+        addRegOffset(BuildMI(bodyMBB, DL, TII.get(MovMIOpc))
+                         .setMIFlag(MachineInstr::FrameSetup),
+                     StackPtr, false, 0)
+            .addImm(0)
+            .setMIFlag(MachineInstr::FrameSetup);
+
+        const unsigned SUBOpc =
+            getSUBriOpcode(Uses64BitFramePtr, StackProbeSize);
+        BuildMI(bodyMBB, DL, TII.get(SUBOpc), StackPtr)
+            .addReg(StackPtr)
+            .addImm(StackProbeSize)
+            .setMIFlag(MachineInstr::FrameSetup);
+
+        // cmp with stack pointer bound
+        BuildMI(bodyMBB, DL,
+                TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
+            .addReg(FinalStackProbed)
+            .addReg(StackPtr)
+            .setMIFlag(MachineInstr::FrameSetup);
+
+        // jump
+        BuildMI(bodyMBB, DL, TII.get(X86::JCC_1))
+            .addMBB(bodyMBB)
+            .addImm(X86::COND_B)
+            .setMIFlag(MachineInstr::FrameSetup);
+        bodyMBB->addSuccessor(bodyMBB);
+        bodyMBB->addSuccessor(footMBB);
+      }
+
+      // setup loop footer
+      {
+        BuildMI(footMBB, DL, TII.get(TargetOpcode::COPY), StackPtr)
+            .addReg(FinalStackProbed)
+            .setMIFlag(MachineInstr::FrameSetup);
+        addRegOffset(BuildMI(footMBB, DL, TII.get(MovMIOpc))
+                         .setMIFlag(MachineInstr::FrameSetup),
+                     StackPtr, false, 0)
+            .addImm(0)
+            .setMIFlag(MachineInstr::FrameSetup);
+        footMBB->addSuccessor(&MBB);
+      }
+
+      recomputeLiveIns(*headMBB);
+      recomputeLiveIns(*bodyMBB);
+      recomputeLiveIns(*footMBB);
+      recomputeLiveIns(MBB);
+    }
+  } else {
+    MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg)
+                           .addReg(Reg)
+                           .addImm(Val)
+                           .setMIFlag(MachineInstr::FrameSetup);
+
+    // The EFLAGS implicit def is dead.
+    MI->getOperand(3).setIsDead();
+  }
 }
 
 bool X86FrameLowering::has128ByteRedZone(const MachineFunction& MF) const {
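
The dispatch added to BuildStackAlignAND boils down to a single predicate; the sketch below restates it with illustrative values (needsProbedAlign is a hypothetical helper, not part of the patch):

#include <cstdint>
#include <cstdio>

// True when the aligning AND must itself be expanded into the probed
// entry/head/body/foot loop built above.
bool needsProbedAlign(bool RegIsStackPtr, bool EmitInlineStackProbe,
                      uint64_t MaxAlign, uint64_t StackProbeSize) {
  return RegIsStackPtr && EmitInlineStackProbe && MaxAlign >= StackProbeSize;
}

int main() {
  // Small alignment (64 bytes): handled by shrinking the first probe instead.
  printf("%d\n", needsProbedAlign(true, true, 64, 4096));    // prints 0
  // Large alignment (four pages): the AND may skip whole pages, so the
  // probed loop is emitted.
  printf("%d\n", needsProbedAlign(true, true, 16384, 4096)); // prints 1
}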

llvm/lib/Target/X86/X86FrameLowering.h

+4 −4
@@ -213,14 +213,14 @@ class X86FrameLowering : public TargetFrameLowering {
   void emitStackProbeInlineGenericBlock(MachineFunction &MF,
                                         MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator MBBI,
-                                        const DebugLoc &DL,
-                                        uint64_t Offset) const;
+                                        const DebugLoc &DL, uint64_t Offset,
+                                        uint64_t Align) const;
 
   void emitStackProbeInlineGenericLoop(MachineFunction &MF,
                                        MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator MBBI,
-                                       const DebugLoc &DL,
-                                       uint64_t Offset) const;
+                                       const DebugLoc &DL, uint64_t Offset,
+                                       uint64_t Align) const;
 
   /// Emit a stub to later inline the target stack probe.
   MachineInstr *emitStackProbeInlineStub(MachineFunction &MF,
New test file (+88 −0)
@@ -0,0 +1,88 @@
+; RUN: llc < %s | FileCheck %s
+
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @foo_noprotect() local_unnamed_addr {
+; CHECK-LABEL: foo_noprotect:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movq %rsp, %rbp
+; CHECK-NEXT:    .cfi_def_cfa_register %rbp
+; CHECK-NEXT:    andq $-4096, %rsp # imm = 0xF000
+; CHECK-NEXT:    subq $73728, %rsp # imm = 0x12000
+; CHECK-NEXT:    movl $1, 392(%rsp)
+; CHECK-NEXT:    movl $1, 28792(%rsp)
+; CHECK-NEXT:    movl (%rsp), %eax
+; CHECK-NEXT:    movq %rbp, %rsp
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
+; CHECK-NEXT:    retq
+
+
+  %a = alloca i32, i64 18000, align 4096
+  %b0 = getelementptr inbounds i32, i32* %a, i64 98
+  %b1 = getelementptr inbounds i32, i32* %a, i64 7198
+  store volatile i32 1, i32* %b0
+  store volatile i32 1, i32* %b1
+  %c = load volatile i32, i32* %a
+  ret i32 %c
+}
+
+define i32 @foo_protect() local_unnamed_addr #0 {
+; CHECK-LABEL: foo_protect:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movq %rsp, %rbp
+; CHECK-NEXT:    .cfi_def_cfa_register %rbp
+; CHECK-NEXT:    movq %rsp, %r11
+; CHECK-NEXT:    andq $-4096, %r11 # imm = 0xF000
+; CHECK-NEXT:    cmpq %rsp, %r11
+; CHECK-NEXT:    je .LBB1_4
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    subq $4096, %rsp # imm = 0x1000
+; CHECK-NEXT:    cmpq %rsp, %r11
+; CHECK-NEXT:    jb .LBB1_3
+; CHECK-NEXT:  .LBB1_2: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movq $0, (%rsp)
+; CHECK-NEXT:    subq $4096, %rsp # imm = 0x1000
+; CHECK-NEXT:    cmpq %rsp, %r11
+; CHECK-NEXT:    jb .LBB1_2
+; CHECK-NEXT:  .LBB1_3:
+; CHECK-NEXT:    movq %r11, %rsp
+; CHECK-NEXT:    movq $0, (%rsp)
+; CHECK-NEXT:  .LBB1_4:
+; CHECK-NEXT:    movq %rsp, %r11
+; CHECK-NEXT:    subq $73728, %r11 # imm = 0x12000
+; CHECK-NEXT:  .LBB1_5: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    subq $4096, %rsp # imm = 0x1000
+; CHECK-NEXT:    movq $0, (%rsp)
+; CHECK-NEXT:    cmpq %r11, %rsp
+; CHECK-NEXT:    jne .LBB1_5
+; CHECK-NEXT:  # %bb.6:
+; CHECK-NEXT:    movl $1, 392(%rsp)
+; CHECK-NEXT:    movl $1, 28792(%rsp)
+; CHECK-NEXT:    movl (%rsp), %eax
+; CHECK-NEXT:    movq %rbp, %rsp
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
+; CHECK-NEXT:    retq
+
+
+
+
+  %a = alloca i32, i64 18000, align 4096
+  %b0 = getelementptr inbounds i32, i32* %a, i64 98
+  %b1 = getelementptr inbounds i32, i32* %a, i64 7198
+  store volatile i32 1, i32* %b0
+  store volatile i32 1, i32* %b1
+  %c = load volatile i32, i32* %a
+  ret i32 %c
+}
+
+attributes #0 = {"probe-stack"="inline-asm"}
