Skip to content

Commit cc8bf21

Browse files
committed
[llvm][ARM]Add ARM widen strings pass
- This pass optimizes memcpys by padding destinations and sources out to a full word, so that the ARM backend generates full-word loads instead of loading a single byte (ldrb) and/or a half word (ldrh). It only pads the destination when it is a stack-allocated, constant-size array, and the source when it is a constant string. The heuristic that decides whether to pad is very basic and could be improved to allow more cases to be padded. - The pass works at the midend level. Change-Id: I1c6371f0962e7ad3c166602b800d041ac1cc7b04
1 parent 2a9f93b commit cc8bf21

12 files changed

+487
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
//===- ARMWidenStrings.h --------------------------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// This file provides the interface for the ARMWidenStrings pass.
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
#ifndef LLVM_TRANSFORMS_SCALAR_ARMWIDENSTRINGS_H
14+
#define LLVM_TRANSFORMS_SCALAR_ARMWIDENSTRINGS_H
15+
16+
#include "llvm/IR/PassManager.h"
17+
18+
namespace llvm {
19+
20+
class Module;
21+
22+
/// New-pass-manager function pass registered under the name
/// "arm-widen-strings". It pads eligible constant-string memcpy operands
/// (stack destination and global source) out to a multiple of four bytes.
class ARMWidenStringsPass : public PassInfoMixin<ARMWidenStringsPass> {
public:
  ARMWidenStringsPass() = default;

  /// Runs the widening transformation on \p F.
  ///
  /// \returns PreservedAnalyses::all() when the transformation reports that
  /// nothing changed, PreservedAnalyses::none() otherwise.
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
27+
28+
} // end namespace llvm
29+
30+
#endif // LLVM_TRANSFORMS_SCALAR_ARMWIDENSTRINGS_H

llvm/lib/Passes/PassBuilder.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,7 @@
207207
#include "llvm/Transforms/Instrumentation/ThreadSanitizer.h"
208208
#include "llvm/Transforms/ObjCARC.h"
209209
#include "llvm/Transforms/Scalar/ADCE.h"
210+
#include "llvm/Transforms/Scalar/ARMWidenStrings.h"
210211
#include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
211212
#include "llvm/Transforms/Scalar/AnnotationRemarks.h"
212213
#include "llvm/Transforms/Scalar/BDCE.h"

llvm/lib/Passes/PassRegistry.def

+1
Original file line numberDiff line numberDiff line change
@@ -489,6 +489,7 @@ FUNCTION_PASS("view-dom-only", DomOnlyViewer())
489489
FUNCTION_PASS("view-post-dom", PostDomViewer())
490490
FUNCTION_PASS("view-post-dom-only", PostDomOnlyViewer())
491491
FUNCTION_PASS("wasm-eh-prepare", WasmEHPreparePass())
492+
FUNCTION_PASS("arm-widen-strings", ARMWidenStringsPass())
492493
#undef FUNCTION_PASS
493494

494495
#ifndef FUNCTION_PASS_WITH_PARAMS
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
// ARMWidenStrings.cpp - Widen strings to word boundaries to speed up
2+
// programs that use simple strcpy's with constant strings as source
3+
// and stack allocated array for destination.
4+
5+
#define DEBUG_TYPE "arm-widen-strings"
6+
7+
#include "llvm/Transforms/Scalar/ARMWidenStrings.h"
8+
#include "llvm/Analysis/LoopInfo.h"
9+
#include "llvm/IR/BasicBlock.h"
10+
#include "llvm/IR/Constants.h"
11+
#include "llvm/IR/Function.h"
12+
#include "llvm/IR/GlobalVariable.h"
13+
#include "llvm/IR/IRBuilder.h"
14+
#include "llvm/IR/Instructions.h"
15+
#include "llvm/IR/Intrinsics.h"
16+
#include "llvm/IR/Module.h"
17+
#include "llvm/IR/Operator.h"
18+
#include "llvm/IR/ValueSymbolTable.h"
19+
#include "llvm/Pass.h"
20+
#include "llvm/Support/CommandLine.h"
21+
#include "llvm/Support/Debug.h"
22+
#include "llvm/Support/raw_ostream.h"
23+
#include "llvm/TargetParser/Triple.h"
24+
#include "llvm/Transforms/Scalar.h"
25+
26+
using namespace llvm;
27+
28+
/// Command-line switch that turns the transformation off: when
/// -disable-arm-widen-strings is given, ARMWidenStrings::run returns
/// immediately without modifying the function. Defaults to false (pass
/// enabled). Marked hidden and given a description so that it is documented
/// in -help-hidden rather than showing up as a bare, unexplained flag.
cl::opt<bool> DisableARMWidenStrings(
    "disable-arm-widen-strings", cl::Hidden, cl::init(false),
    cl::desc("Disable the ARM widen strings pass"));
30+
31+
namespace {
32+
33+
class ARMWidenStrings {
34+
public:
35+
/*
36+
Max number of bytes that memcpy allows for lowering to load/stores before it
37+
uses library function (__aeabi_memcpy). This is the same value returned by
38+
ARMSubtarget::getMaxInlineSizeThreshold which I would have called in place of
39+
the constant int but can't get access to the subtarget info class from the
40+
midend.
41+
*/
42+
const unsigned int MemcpyInliningLimit = 64;
43+
44+
bool run(Function &F);
45+
};
46+
47+
/// Returns true when \p t is a non-null array type whose element type is an
/// 8-bit integer, i.e. something that looks like a C char array.
static bool IsCharArray(Type *t) {
  // Reject null and non-array types before touching the element type.
  if (!t || !t->isArrayTy())
    return false;

  Type *ElementTy = t->getArrayElementType();
  return ElementTy->isIntegerTy(/*Bitwidth=*/8);
}
52+
53+
/// Widens eligible constant-string memcpys in \p F to a whole number of
/// 4-byte words.
///
/// A call to the llvm.memcpy intrinsic is rewritten only when all of the
/// following hold:
///  - the destination operand is a static alloca of an i8 array,
///  - the source operand is a constant, local-linkage, unnamed_addr global
///    whose initializer is an i8 ConstantDataArray,
///  - the copy is non-volatile and has a constant length,
///  - that length equals both the destination and the source array size,
///  - the length is not already a multiple of 4, and
///  - the padded length does not exceed MemcpyInliningLimit.
///
/// For such a call the destination alloca is replaced by a padded one, a new
/// zero-padded copy of the source global is created, and the memcpy is
/// updated to copy the padded number of bytes.
///
/// \returns true if at least one memcpy was widened, false otherwise (also
/// when the pass is disabled on the command line).
bool ARMWidenStrings::run(Function &F) {
  if (DisableARMWidenStrings)
    return false;

  LLVM_DEBUG(dbgs() << "Running ARMWidenStrings on function " << F.getName()
                    << "\n");

  // Tracks whether any IR was modified so callers can keep analyses alive
  // when the function is left untouched.
  bool Changed = false;

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      auto *CI = dyn_cast<CallInst>(&I);
      if (!CI)
        continue;

      // Only direct calls to the llvm.memcpy intrinsic are of interest.
      Function *CallMemcpy = CI->getCalledFunction();
      if (!CallMemcpy || !CallMemcpy->isIntrinsic() ||
          CallMemcpy->getIntrinsicID() != Intrinsic::memcpy)
        continue;

      LLVM_DEBUG(dbgs() << "Found call to strcpy/memcpy:\n" << *CI << "\n");

      // memcpy(dest, src, length, isvolatile)
      auto *Alloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
      auto *SourceVar = dyn_cast<GlobalVariable>(CI->getArgOperand(1));
      auto *BytesToCopy = dyn_cast<ConstantInt>(CI->getArgOperand(2));
      auto *IsVolatile = dyn_cast<ConstantInt>(CI->getArgOperand(3));

      if (!BytesToCopy) {
        LLVM_DEBUG(dbgs() << "Number of bytes to copy is null\n");
        continue;
      }

      const uint64_t NumBytesToCopy = BytesToCopy->getZExtValue();

      if (!Alloca) {
        LLVM_DEBUG(dbgs() << "Destination isn't a Alloca\n");
        continue;
      }

      if (!SourceVar) {
        LLVM_DEBUG(dbgs() << "Source isn't a global constant variable\n");
        continue;
      }

      if (!IsVolatile || IsVolatile->isOne()) {
        LLVM_DEBUG(
            dbgs() << "Not widening strings for this memcpy because it's "
                      "a volatile operations\n");
        continue;
      }

      if (NumBytesToCopy % 4 == 0) {
        LLVM_DEBUG(dbgs() << "Bytes to copy in strcpy/memcpy is already word "
                             "aligned so nothing to do here.\n");
        continue;
      }

      // The source must be an immutable, module-private global whose address
      // is not significant; otherwise swapping it for a padded copy is not
      // safe.
      if (!SourceVar->hasInitializer() || !SourceVar->isConstant() ||
          !SourceVar->hasLocalLinkage() || !SourceVar->hasGlobalUnnamedAddr()) {
        LLVM_DEBUG(dbgs() << "Source is not constant global, thus it's "
                             "mutable therefore it's not safe to pad\n");
        continue;
      }

      auto *SourceDataArray =
          dyn_cast<ConstantDataArray>(SourceVar->getInitializer());
      if (!SourceDataArray || !IsCharArray(SourceDataArray->getType())) {
        LLVM_DEBUG(dbgs() << "Source isn't a constant data array\n");
        continue;
      }

      if (!Alloca->isStaticAlloca()) {
        LLVM_DEBUG(dbgs() << "Destination allocation isn't a static "
                             "constant which is locally allocated in this "
                             "function, so skipping.\n");
        continue;
      }

      // Make sure the destination is definitely a char array.
      if (!IsCharArray(Alloca->getAllocatedType())) {
        LLVM_DEBUG(dbgs() << "Destination doesn't look like a constant char (8 "
                             "bits) array\n");
        continue;
      }
      LLVM_DEBUG(dbgs() << "With Alloca: " << *Alloca << "\n");

      const uint64_t DZSize = Alloca->getAllocatedType()->getArrayNumElements();
      const uint64_t SZSize = SourceDataArray->getType()->getNumElements();

      // For safety, only pad when the number of bytes to copy, the
      // destination array size and the (constant) source string size are all
      // equal.
      LLVM_DEBUG(dbgs() << "Number of bytes to copy is: " << NumBytesToCopy
                        << "\n");
      LLVM_DEBUG(dbgs() << "Size of destination array is: " << DZSize << "\n");
      LLVM_DEBUG(dbgs() << "Size of source array is: " << SZSize << "\n");
      if (NumBytesToCopy != DZSize || DZSize != SZSize) {
        LLVM_DEBUG(dbgs() << "Size of number of bytes to copy, destination "
                             "array and source string don't match, so "
                             "skipping\n");
        continue;
      }
      LLVM_DEBUG(dbgs() << "Going to widen.\n");

      // Keep the size arithmetic in 64 bits so that a very large copy length
      // cannot be truncated into a value that passes the limit check below.
      const uint64_t NumBytesToPad = 4 - (NumBytesToCopy % 4);
      LLVM_DEBUG(dbgs() << "Number of bytes to pad by is " << NumBytesToPad
                        << "\n");
      const uint64_t TotalBytes = NumBytesToCopy + NumBytesToPad;

      if (TotalBytes > MemcpyInliningLimit) {
        LLVM_DEBUG(
            dbgs() << "Not going to pad because total number of bytes is "
                   << TotalBytes
                   << " which be greater than the inlining "
                      "limit for memcpy which is "
                   << MemcpyInliningLimit << "\n");
        continue;
      }

      // Replace the destination char array with a word-sized one
      // (memcpy(X,...,...)). The new alloca is inserted right before the old
      // one and inherits its name and alignment.
      IRBuilder<> BuildAlloca(Alloca);
      AllocaInst *NewAlloca = BuildAlloca.CreateAlloca(ArrayType::get(
          Alloca->getAllocatedType()->getArrayElementType(), TotalBytes));
      NewAlloca->takeName(Alloca);
      NewAlloca->setAlignment(Alloca->getAlign());
      // NOTE(review): users that encode the old array size (for example sized
      // lifetime markers) are redirected unchanged - confirm that is
      // acceptable for every caller.
      Alloca->replaceAllUsesWith(NewAlloca);
      // The old alloca has no users left. It is a different instruction from
      // the call currently being visited, so erasing it does not disturb the
      // iteration.
      Alloca->eraseFromParent();

      LLVM_DEBUG(dbgs() << "Updating users of destination stack object to use "
                        << "new size\n");

      // Build the replacement source string (memcpy(...,X,...)): the original
      // bytes followed by NumBytesToPad null bytes.
      StringRef Data = SourceDataArray->getRawDataValues();
      std::vector<uint8_t> StrData(Data.begin(), Data.end());
      StrData.insert(StrData.end(), NumBytesToPad, uint8_t{0});
      ArrayRef<uint8_t> Arr(StrData.data(), StrData.size());

      // Create the padded version of the global string. The new global
      // starts unnamed and takes over the original's name below.
      Constant *SourceReplace = ConstantDataArray::get(F.getContext(), Arr);
      auto *NewGV = new GlobalVariable(*F.getParent(), SourceReplace->getType(),
                                       /*isConstant=*/true,
                                       SourceVar->getLinkage(), SourceReplace);

      // Copy any other attributes from the original global string,
      // e.g. unnamed_addr.
      NewGV->copyAttributesFrom(SourceVar);
      NewGV->takeName(SourceVar);

      // Point the intrinsic at the padded source.
      CI->setArgOperand(1, NewGV);

      // Update the number of bytes to copy (memcpy(...,...,X)).
      CI->setArgOperand(2,
                        ConstantInt::get(BytesToCopy->getType(), TotalBytes));
      LLVM_DEBUG(dbgs() << "Padded dest/source and increased number of bytes:\n"
                        << *CI << "\n"
                        << *NewAlloca << "\n");

      Changed = true;
    }
  }

  return Changed;
}
218+
219+
} // end of anonymous namespace
220+
221+
/// New-pass-manager entry point: delegates to ARMWidenStrings::run on \p F
/// and maps its boolean result onto the preserved-analyses set. \p AM is not
/// consulted.
PreservedAnalyses ARMWidenStringsPass::run(Function &F,
                                           FunctionAnalysisManager &AM) {
  const bool Modified = ARMWidenStrings().run(F);
  // A reported change conservatively invalidates every analysis.
  return Modified ? PreservedAnalyses::none() : PreservedAnalyses::all();
}

llvm/lib/Transforms/Scalar/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ add_llvm_component_library(LLVMScalarOpts
22
ADCE.cpp
33
AlignmentFromAssumptions.cpp
44
AnnotationRemarks.cpp
5+
ARMWidenStrings.cpp
56
BDCE.cpp
67
CallSiteSplitting.cpp
78
ConstantHoisting.cpp
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes="default<O2>,arm-widen-strings" -S | FileCheck %s
2+
; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes="default<O0>" -S | FileCheck %s --check-prefix=TURNED-OFF
3+
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
4+
5+
; CHECK: [12 x i8]
6+
; TURNED-OFF-NOT: [12 x i8]
7+
@.str = private unnamed_addr constant [10 x i8] c"123456789\00", align 1
8+
9+
; Function Attrs: nounwind
10+
; Copies the 10-byte constant @.str into a 10-byte stack array with strcpy and
; hands the array to @bar. In the default-prefix run the FileCheck directives
; expect the array to be re-created with 12 elements and the copy to appear as
; an llvm.memcpy call; the second RUN line expects no 12-element array.
define hidden void @foo() #0 {
11+
entry:
12+
; CHECK: %something = alloca [12 x i8]
13+
; TURNED-OFF-NOT: %something = alloca [12 x i8]
14+
%something = alloca [10 x i8], align 1
15+
%arraydecay = getelementptr inbounds [10 x i8], ptr %something, i32 0, i32 0
16+
; CHECK: @llvm.memcpy.p0.p0.i32
17+
%call = call ptr @strcpy(ptr %arraydecay, ptr @.str)
18+
%arraydecay1 = getelementptr inbounds [10 x i8], ptr %something, i32 0, i32 0
19+
%call2 = call i32 @bar(ptr %arraydecay1)
20+
ret void
21+
}
22+
23+
declare ptr @strcpy(ptr, ptr) #1
24+
25+
declare i32 @bar(...) #1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes="default<O2>,arm-widen-strings" -S | FileCheck %s
2+
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
3+
4+
; CHECK: [64 x i8]
5+
@.str = private unnamed_addr constant [62 x i8] c"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\00", align 1
6+
7+
; Function Attrs: nounwind
8+
; Copies the 62-byte constant @.str into a 62-byte stack array with strcpy and
; hands the array to @bar. Padding 62 bytes up to the next multiple of four
; gives 64, and the FileCheck directives expect a 64-element array together
; with an llvm.memcpy call.
define hidden void @foo() #0 {
9+
entry:
10+
; CHECK: %something = alloca [64 x i8]
11+
%something = alloca [62 x i8], align 1
12+
%arraydecay = getelementptr inbounds [62 x i8], ptr %something, i32 0, i32 0
13+
; CHECK: @llvm.memcpy.p0.p0.i32
14+
%call = call ptr @strcpy(ptr %arraydecay, ptr @.str)
15+
%arraydecay1 = getelementptr inbounds [62 x i8], ptr %something, i32 0, i32 0
16+
%call2 = call i32 @bar(ptr %arraydecay1)
17+
ret void
18+
}
19+
20+
declare ptr @strcpy(ptr, ptr) #1
21+
22+
declare i32 @bar(...) #1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=arm-widen-strings -S | FileCheck %s
2+
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
3+
target triple = "thumbv6m-arm-none-eabi"
4+
5+
; CHECK: [17 x i8]
6+
@.str = private unnamed_addr constant [17 x i8] c"aaaaaaaaaaaaaaaa\00", align 1
7+
8+
; Function Attrs: nounwind
9+
; Negative case: the destination stack array holds 20 bytes while the memcpy
; length and the source @.str are both 17 bytes. The pass in this commit only
; widens when all three sizes are equal, and the directive placed before @.str
; expects the 17-element array type to still be present in the output.
define hidden void @foo() local_unnamed_addr #0 {
10+
entry:
11+
%something = alloca [20 x i8], align 1
12+
call void @llvm.lifetime.start(i64 20, ptr nonnull %something) #3
13+
call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 nonnull %something, ptr align 1 @.str, i32 17, i1 false)
14+
%call2 = call i32 @bar(ptr nonnull %something) #3
15+
call void @llvm.lifetime.end(i64 20, ptr nonnull %something) #3
16+
ret void
17+
}
18+
19+
; Function Attrs: argmemonly nounwind
20+
declare void @llvm.lifetime.start(i64, ptr nocapture) #1
21+
22+
declare i32 @bar(...) local_unnamed_addr #2
23+
24+
; Function Attrs: argmemonly nounwind
25+
declare void @llvm.lifetime.end(i64, ptr nocapture) #1
26+
27+
; Function Attrs: argmemonly nounwind
28+
declare void @llvm.memcpy.p0i8.p0i8.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1) #1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes=arm-widen-strings -S | FileCheck %s
2+
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
3+
target triple = "thumbv6m-arm-none-eabi"
4+
5+
; CHECK: [65 x i8]
6+
; CHECK-NOT: [68 x i8]
7+
@.str = private unnamed_addr constant [65 x i8] c"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzz\00", align 1
8+
9+
; Function Attrs: nounwind
10+
; Negative case: destination, source and copy length are all 65 bytes, so the
; sizes match, but padding to the next multiple of four would give 68 bytes,
; which is above the pass's 64-byte memcpy inlining limit. The directives
; placed before @.str expect the 65-element type to remain and forbid a
; 68-element one.
define hidden void @foo() local_unnamed_addr #0 {
11+
entry:
12+
%something = alloca [65 x i8], align 1
13+
call void @llvm.lifetime.start(i64 65, ptr nonnull %something) #3
14+
call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 nonnull %something, ptr align 1 @.str, i32 65, i1 false)
15+
%call2 = call i32 @bar(ptr nonnull %something) #3
16+
call void @llvm.lifetime.end(i64 65, ptr nonnull %something) #3
17+
ret void
18+
}
19+
20+
; Function Attrs: argmemonly nounwind
21+
declare void @llvm.lifetime.start(i64, ptr nocapture) #1
22+
23+
declare i32 @bar(...) local_unnamed_addr #2
24+
25+
; Function Attrs: argmemonly nounwind
26+
declare void @llvm.lifetime.end(i64, ptr nocapture) #1
27+
28+
; Function Attrs: argmemonly nounwind
29+
declare void @llvm.memcpy.p0i8.p0i8.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1) #1

0 commit comments

Comments
 (0)