-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp            |   4
-rw-r--r--  lib/Target/X86/X86Subtarget.cpp               |  18
-rw-r--r--  lib/Transforms/Scalar/LoopStrengthReduce.cpp  | 184
-rw-r--r--  test/CodeGen/X86/loop-strength-reduce-2.ll    |  30
-rw-r--r--  test/CodeGen/X86/loop-strength-reduce-3.ll    |  30
-rw-r--r--  test/CodeGen/X86/loop-strength-reduce.ll      |   7
6 files changed, 212 insertions, 61 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 412a954..44f28d3 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6463,6 +6463,10 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
// We can only fold this if we don't need an extra load.
if (Subtarget->GVRequiresExtraLoad(AM.BaseGV, getTargetMachine(), false))
return false;
+ // If BaseGV requires a register, we cannot also have a BaseReg.
+ if (Subtarget->GVRequiresRegister(AM.BaseGV, getTargetMachine(), false) &&
+ AM.HasBaseReg)
+ return false;
// X86-64 only supports addr of globals in small code model.
if (Subtarget->is64Bit()) {
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index b836471..583fe8a 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -59,7 +59,23 @@ bool X86Subtarget::GVRequiresExtraLoad(const GlobalValue* GV,
return (GV->hasDLLImportLinkage());
}
}
-
+ return false;
+}
+
+/// True if accessing the GV requires a register. This is a superset of the
+/// cases where GVRequiresExtraLoad is true. Some variations of PIC require
+/// a register, but not an extra load.
+bool X86Subtarget::GVRequiresRegister(const GlobalValue *GV,
+ const TargetMachine& TM,
+ bool isDirectCall) const
+{
+ if (GVRequiresExtraLoad(GV, TM, isDirectCall))
+ return true;
+ // Code below here need only consider cases where GVRequiresExtraLoad
+ // returns false.
+ if (TM.getRelocationModel() == Reloc::PIC_)
+ return !isDirectCall &&
+ (GV->hasInternalLinkage() || GV->hasExternalLinkage());
return false;
}
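
Taken together, the two X86 changes encode one constraint: under PIC, a global that is not being directly called already consumes a register (it is addressed relative to the PIC base), so an addressing mode that references such a global cannot also carry a separate base register. The following is a minimal, self-contained C++ sketch of that constraint, not the LLVM API; RelocModel, Linkage, AddrMode and both free functions are illustrative stand-ins, and the "extra load" test is reduced to the dllimport case only.

    // Minimal sketch of the rule the two X86 changes above encode.
    // All names here are illustrative stand-ins, not the LLVM API.
    #include <cassert>

    enum class RelocModel { Static, PIC };
    enum class Linkage { Internal, External, DLLImport, Other };

    struct AddrMode {
      bool HasBaseGV = false;   // the address references a global
      bool HasBaseReg = false;  // the address also uses a base register
      Linkage GVLinkage = Linkage::Other;
    };

    // Analogue of GVRequiresRegister: a superset of the extra-load cases,
    // because some PIC accesses need a register even without an extra load.
    bool globalNeedsRegister(RelocModel RM, Linkage L, bool IsDirectCall) {
      bool NeedsExtraLoad = (L == Linkage::DLLImport);  // simplified stand-in
      if (NeedsExtraLoad)
        return true;
      if (RM == RelocModel::PIC)
        return !IsDirectCall && (L == Linkage::Internal || L == Linkage::External);
      return false;
    }

    // Analogue of the new check in isLegalAddressingMode: if the global itself
    // consumes a register, the mode cannot also carry a separate base register.
    bool addressingModeIsLegal(const AddrMode &AM, RelocModel RM) {
      if (AM.HasBaseGV && AM.HasBaseReg &&
          globalNeedsRegister(RM, AM.GVLinkage, /*IsDirectCall=*/false))
        return false;
      return true;
    }

    int main() {
      AddrMode AM;
      AM.HasBaseGV = true;
      AM.GVLinkage = Linkage::Internal;
      AM.HasBaseReg = true;
      assert(!addressingModeIsLegal(AM, RelocModel::PIC));    // GV needs the PIC register
      assert(addressingModeIsLegal(AM, RelocModel::Static));  // GV folds into the displacement
      return 0;
    }

The real check lives in X86TargetLowering::isLegalAddressingMode, which LoopStrengthReduce consults through the TargetLowering hook when deciding what can be folded.
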
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 38fd7fa..9b792c1 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -444,7 +444,33 @@ static bool IVUseShouldUsePostIncValue(Instruction *User, Instruction *IV,
return true;
}
-
+/// isAddressUse - Returns true if the specified instruction is using the
+/// specified value as an address.
+static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
+ bool isAddress = isa<LoadInst>(Inst);
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ if (SI->getOperand(1) == OperandVal)
+ isAddress = true;
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+ // Addressing modes can also be folded into prefetches and a variety
+ // of intrinsics.
+ switch (II->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::prefetch:
+ case Intrinsic::x86_sse2_loadu_dq:
+ case Intrinsic::x86_sse2_loadu_pd:
+ case Intrinsic::x86_sse_loadu_ps:
+ case Intrinsic::x86_sse_storeu_ps:
+ case Intrinsic::x86_sse2_storeu_pd:
+ case Intrinsic::x86_sse2_storeu_dq:
+ case Intrinsic::x86_sse2_storel_dq:
+ if (II->getOperand(1) == OperandVal)
+ isAddress = true;
+ break;
+ }
+ }
+ return isAddress;
+}
/// AddUsersIfInteresting - Inspect the specified instruction. If it is a
/// reducible SCEV, recursively add its users to the IVUsesByStride set and
@@ -731,15 +757,16 @@ void BasedUser::RewriteInstructionToUseNewBase(const SCEVHandle &NewBase,
}
-/// isTargetConstant - Return true if the following can be referenced by the
-/// immediate field of a target instruction.
-static bool isTargetConstant(const SCEVHandle &V, const Type *UseTy,
- const TargetLowering *TLI) {
+/// fitsInAddressMode - Return true if V can be subsumed within an addressing
+/// mode, and does not need to be put in a register first.
+static bool fitsInAddressMode(const SCEVHandle &V, const Type *UseTy,
+ const TargetLowering *TLI, bool HasBaseReg) {
if (SCEVConstant *SC = dyn_cast<SCEVConstant>(V)) {
int64_t VC = SC->getValue()->getSExtValue();
if (TLI) {
TargetLowering::AddrMode AM;
AM.BaseOffs = VC;
+ AM.HasBaseReg = HasBaseReg;
return TLI->isLegalAddressingMode(AM, UseTy);
} else {
// Defaults to PPC. PPC allows a sign-extended 16-bit immediate field.
@@ -754,6 +781,7 @@ static bool isTargetConstant(const SCEVHandle &V, const Type *UseTy,
if (GlobalValue *GV = dyn_cast<GlobalValue>(Op0)) {
TargetLowering::AddrMode AM;
AM.BaseGV = GV;
+ AM.HasBaseReg = HasBaseReg;
return TLI->isLegalAddressingMode(AM, UseTy);
}
}
@@ -846,7 +874,7 @@ static void MoveImmediateValues(const TargetLowering *TLI,
return;
} else if (SCEVMulExpr *SME = dyn_cast<SCEVMulExpr>(Val)) {
// Transform "8 * (4 + v)" -> "32 + 8*V" if "32" fits in the immed field.
- if (isAddress && isTargetConstant(SME->getOperand(0), UseTy, TLI) &&
+ if (isAddress && fitsInAddressMode(SME->getOperand(0), UseTy, TLI, false) &&
SME->getNumOperands() == 2 && SME->isLoopInvariant(L)) {
SCEVHandle SubImm = SE->getIntegerSCEV(0, Val->getType());
@@ -859,7 +887,7 @@ static void MoveImmediateValues(const TargetLowering *TLI,
// Scale SubImm up by "8". If the result is a target constant, we are
// good.
SubImm = SE->getMulExpr(SubImm, SME->getOperand(0));
- if (isTargetConstant(SubImm, UseTy, TLI)) {
+ if (fitsInAddressMode(SubImm, UseTy, TLI, false)) {
// Accumulate the immediate.
Imm = SE->getAddExpr(Imm, SubImm);
@@ -873,7 +901,7 @@ static void MoveImmediateValues(const TargetLowering *TLI,
// Loop-variant expressions must stay in the immediate field of the
// expression.
- if ((isAddress && isTargetConstant(Val, UseTy, TLI)) ||
+ if ((isAddress && fitsInAddressMode(Val, UseTy, TLI, false)) ||
!Val->isLoopInvariant(L)) {
Imm = SE->getAddExpr(Imm, Val);
Val = SE->getIntegerSCEV(0, Val->getType());
@@ -912,21 +940,28 @@ static void SeparateSubExprs(std::vector<SCEVHandle> &SubExprs,
}
}
+// This is logically local to the following function, but C++ says we have
+// to make it file scope.
+struct SubExprUseData { unsigned Count; bool notAllUsesAreFree; };
-/// RemoveCommonExpressionsFromUseBases - Look through all of the uses in Bases,
-/// removing any common subexpressions from it. Anything truly common is
-/// removed, accumulated, and returned. This looks for things like (a+b+c) and
+/// RemoveCommonExpressionsFromUseBases - Look through all of the Bases of all
+/// the Uses, removing any common subexpressions, except that if all such
+/// subexpressions can be folded into an addressing mode for all uses inside
+/// the loop (this case is referred to as "free" in comments herein) we do
+/// not remove anything. This looks for things like (a+b+c) and
/// (a+c+d) and computes the common (a+c) subexpression. The common expression
/// is *removed* from the Bases and returned.
static SCEVHandle
RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
- ScalarEvolution *SE, Loop *L) {
+ ScalarEvolution *SE, Loop *L,
+ const TargetLowering *TLI) {
unsigned NumUses = Uses.size();
// Only one use? This is a very common case, so we handle it specially and
// cheaply.
SCEVHandle Zero = SE->getIntegerSCEV(0, Uses[0].Base->getType());
SCEVHandle Result = Zero;
+ SCEVHandle FreeResult = Zero;
if (NumUses == 1) {
// If the use is inside the loop, use its base, regardless of what it is:
// it is clearly shared across all the IV's. If the use is outside the loop
@@ -939,7 +974,10 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
// To find common subexpressions, count how many of Uses use each expression.
// If any subexpressions are used Uses.size() times, they are common.
- std::map<SCEVHandle, unsigned> SubExpressionUseCounts;
+ // Also track whether all uses of each expression can be moved into
+ // an addressing mode "for free"; such expressions are left within the loop.
+ // struct SubExprUseData { unsigned Count; bool notAllUsesAreFree; };
+ std::map<SCEVHandle, SubExprUseData> SubExpressionUseData;
// UniqueSubExprs - Keep track of all of the subexpressions we see in the
// order we see them.
@@ -962,31 +1000,89 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
// CSEs we can find.
if (Uses[i].Base == Zero) return Zero;
+ // If this use is as an address we may be able to put CSEs in the addressing
+ // mode rather than hoisting them.
+ bool isAddrUse = isAddressUse(Uses[i].Inst, Uses[i].OperandValToReplace);
+ // We may need the UseTy below, but only when isAddrUse, so compute it
+ // only in that case.
+ const Type *UseTy = 0;
+ if (isAddrUse) {
+ UseTy = Uses[i].Inst->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(Uses[i].Inst))
+ UseTy = SI->getOperand(0)->getType();
+ }
+
// Split the expression into subexprs.
SeparateSubExprs(SubExprs, Uses[i].Base, SE);
- // Add one to SubExpressionUseCounts for each subexpr present.
- for (unsigned j = 0, e = SubExprs.size(); j != e; ++j)
- if (++SubExpressionUseCounts[SubExprs[j]] == 1)
+ // Add one to SubExpressionUseData.Count for each subexpr present, and
+ // if the subexpr is not a valid immediate within an addressing mode use,
+ // set SubExpressionUseData.notAllUsesAreFree. We definitely want to
+ // hoist these out of the loop (if they are common to all uses).
+ for (unsigned j = 0, e = SubExprs.size(); j != e; ++j) {
+ if (++SubExpressionUseData[SubExprs[j]].Count == 1)
UniqueSubExprs.push_back(SubExprs[j]);
+ if (!isAddrUse || !fitsInAddressMode(SubExprs[j], UseTy, TLI, false))
+ SubExpressionUseData[SubExprs[j]].notAllUsesAreFree = true;
+ }
SubExprs.clear();
}
// Now that we know how many times each is used, build Result. Iterate over
// UniqueSubexprs so that we have a stable ordering.
for (unsigned i = 0, e = UniqueSubExprs.size(); i != e; ++i) {
- std::map<SCEVHandle, unsigned>::iterator I =
- SubExpressionUseCounts.find(UniqueSubExprs[i]);
- assert(I != SubExpressionUseCounts.end() && "Entry not found?");
- if (I->second == NumUsesInsideLoop) // Found CSE!
- Result = SE->getAddExpr(Result, I->first);
- else
- // Remove non-cse's from SubExpressionUseCounts.
- SubExpressionUseCounts.erase(I);
+ std::map<SCEVHandle, SubExprUseData>::iterator I =
+ SubExpressionUseData.find(UniqueSubExprs[i]);
+ assert(I != SubExpressionUseData.end() && "Entry not found?");
+ if (I->second.Count == NumUsesInsideLoop) { // Found CSE!
+ if (I->second.notAllUsesAreFree)
+ Result = SE->getAddExpr(Result, I->first);
+ else
+ FreeResult = SE->getAddExpr(FreeResult, I->first);
+ } else
+ // Remove non-cse's from SubExpressionUseData.
+ SubExpressionUseData.erase(I);
}
-
+
+ if (FreeResult != Zero) {
+ // We have some subexpressions that can be subsumed into addressing
+ // modes in every use inside the loop. However, it's possible that
+ // there are so many of them that the combined FreeResult cannot
+ // be subsumed, or that the target cannot handle both a FreeResult
+ // and a Result in the same instruction (for example because it would
+ // require too many registers). Check this.
+ for (unsigned i=0; i<NumUses; ++i) {
+ if (!L->contains(Uses[i].Inst->getParent()))
+ continue;
+ // We know this is an addressing mode use; if there are any uses that
+ // are not, FreeResult would be Zero.
+ const Type *UseTy = Uses[i].Inst->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(Uses[i].Inst))
+ UseTy = SI->getOperand(0)->getType();
+ if (!fitsInAddressMode(FreeResult, UseTy, TLI, Result!=Zero)) {
+ // FIXME: could split up FreeResult into pieces here, some hoisted
+ // and some not. Doesn't seem worth it for now.
+ Result = SE->getAddExpr(Result, FreeResult);
+ FreeResult = Zero;
+ break;
+ }
+ }
+ }
+
// If we found no CSE's, return now.
if (Result == Zero) return Result;
+ // If we still have a FreeResult, remove its subexpressions from
+ // SubExpressionUseData. This means they will remain in the use Bases.
+ if (FreeResult != Zero) {
+ SeparateSubExprs(SubExprs, FreeResult, SE);
+ for (unsigned j = 0, e = SubExprs.size(); j != e; ++j) {
+ std::map<SCEVHandle, SubExprUseData>::iterator I =
+ SubExpressionUseData.find(SubExprs[j]);
+ SubExpressionUseData.erase(I);
+ }
+ SubExprs.clear();
+ }
+
// Otherwise, remove all of the CSE's we found from each of the base values.
for (unsigned i = 0; i != NumUses; ++i) {
// Uses outside the loop don't necessarily include the common base, but
@@ -1003,7 +1099,7 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
// Remove any common subexpressions.
for (unsigned j = 0, e = SubExprs.size(); j != e; ++j)
- if (SubExpressionUseCounts.count(SubExprs[j])) {
+ if (SubExpressionUseData.count(SubExprs[j])) {
SubExprs.erase(SubExprs.begin()+j);
--j; --e;
}
@@ -1131,34 +1227,6 @@ static bool isNonConstantNegative(const SCEVHandle &Expr) {
return SC->getValue()->getValue().isNegative();
}
-/// isAddress - Returns true if the specified instruction is using the
-/// specified value as an address.
-static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
- bool isAddress = isa<LoadInst>(Inst);
- if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- if (SI->getOperand(1) == OperandVal)
- isAddress = true;
- } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
- // Addressing modes can also be folded into prefetches and a variety
- // of intrinsics.
- switch (II->getIntrinsicID()) {
- default: break;
- case Intrinsic::prefetch:
- case Intrinsic::x86_sse2_loadu_dq:
- case Intrinsic::x86_sse2_loadu_pd:
- case Intrinsic::x86_sse_loadu_ps:
- case Intrinsic::x86_sse_storeu_ps:
- case Intrinsic::x86_sse2_storeu_pd:
- case Intrinsic::x86_sse2_storeu_dq:
- case Intrinsic::x86_sse2_storel_dq:
- if (II->getOperand(1) == OperandVal)
- isAddress = true;
- break;
- }
- }
- return isAddress;
-}
-
// CollectIVUsers - Transform our list of users and offsets to a bit more
// complex table. In this new vector, each 'BasedUser' contains 'Base', the base
// of the strided accesses, as well as the old information from Uses. We
@@ -1190,7 +1258,7 @@ SCEVHandle LoopStrengthReduce::CollectIVUsers(const SCEVHandle &Stride,
// "A+B"), emit it to the preheader, then remove the expression from the
// UsersToProcess base values.
SCEVHandle CommonExprs =
- RemoveCommonExpressionsFromUseBases(UsersToProcess, SE, L);
+ RemoveCommonExpressionsFromUseBases(UsersToProcess, SE, L, TLI);
// Next, figure out what we can represent in the immediate fields of
// instructions. If we can represent anything there, move it to the imm
@@ -1347,7 +1415,8 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
Constant *C = dyn_cast<Constant>(CommonBaseV);
if (!C ||
(!C->isNullValue() &&
- !isTargetConstant(SE->getUnknown(CommonBaseV), ReplacedTy, TLI)))
+ !fitsInAddressMode(SE->getUnknown(CommonBaseV), ReplacedTy,
+ TLI, false)))
// We want the common base emitted into the preheader! This is just
// using cast as a copy so BitCast (no-op cast) is appropriate
CommonBaseV = new BitCastInst(CommonBaseV, CommonBaseV->getType(),
@@ -1403,7 +1472,8 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
// this by forcing a BitCast (noop cast) to be inserted into the preheader
// in this case.
if (Constant *C = dyn_cast<Constant>(BaseV)) {
- if (!C->isNullValue() && !isTargetConstant(Base, ReplacedTy, TLI)) {
+ if (!C->isNullValue() && !fitsInAddressMode(Base, ReplacedTy,
+ TLI, false)) {
// We want this constant emitted into the preheader! This is just
// using cast as a copy so BitCast (no-op cast) is appropriate
BaseV = new BitCastInst(BaseV, BaseV->getType(), "preheaderinsert",
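
The core of the LoopStrengthReduce change is the distinction between common subexpressions that must be hoisted into the preheader and those that every use can keep "for free" inside its addressing mode. Below is a rough, self-contained C++ sketch of that bookkeeping, under the assumption that the fits-in-address-mode test can be treated as a simple predicate; Expr stands in for SCEVHandle, and splitCommonSubExprs is an illustrative name, not a function in the patch.

    // Rough sketch of the new bookkeeping in RemoveCommonExpressionsFromUseBases.
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    using Expr = std::string;

    // Same shape as the struct the patch adds at file scope.
    struct SubExprUseData { unsigned Count = 0; bool notAllUsesAreFree = false; };

    // For each subexpression common to *all* uses, decide whether it must be
    // hoisted to the preheader or can be left folded into the uses for free.
    void splitCommonSubExprs(const std::vector<std::vector<Expr>> &UseSubExprs,
                             bool (*FitsInAddrMode)(const Expr &),
                             std::vector<Expr> &Hoisted, std::vector<Expr> &Free) {
      std::map<Expr, SubExprUseData> Data;
      for (const auto &Use : UseSubExprs)
        for (const Expr &E : Use) {
          ++Data[E].Count;
          if (!FitsInAddrMode(E))
            Data[E].notAllUsesAreFree = true;  // at least one use cannot fold it
        }
      for (const auto &Entry : Data)
        if (Entry.second.Count == UseSubExprs.size())  // common to every use
          (Entry.second.notAllUsesAreFree ? Hoisted : Free).push_back(Entry.first);
    }

    int main() {
      // Two address uses whose bases are (A + x): the global A folds into an
      // addressing mode, the unknown loop-invariant x does not.
      std::vector<std::vector<Expr>> Uses = {{"A", "x"}, {"A", "x"}};
      std::vector<Expr> Hoisted, Free;
      splitCommonSubExprs(Uses, [](const Expr &E) { return E == "A"; },
                          Hoisted, Free);
      std::cout << "hoisted: " << Hoisted[0] << "  free: " << Free[0] << "\n";
      return 0;
    }

What the sketch omits: the patch then re-checks fitsInAddressMode on the combined FreeResult for every use inside the loop (passing HasBaseReg when there is also a hoisted Result), and folds FreeResult back into Result if the target cannot encode everything in one addressing mode.
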
diff --git a/test/CodeGen/X86/loop-strength-reduce-2.ll b/test/CodeGen/X86/loop-strength-reduce-2.ll
new file mode 100644
index 0000000..1375793
--- /dev/null
+++ b/test/CodeGen/X86/loop-strength-reduce-2.ll
@@ -0,0 +1,30 @@
+; RUN: llvm-as < %s | llc -march=x86 -relocation-model=pic | \
+; RUN: grep {A-} | count 1
+;
+; Make sure the common loop invariant A is hoisted up to the preheader,
+; since too many registers are needed to subsume it into the addressing modes.
+
+@A = global [16 x [16 x i32]] zeroinitializer, align 32 ; <[16 x [16 x i32]]*> [#uses=2]
+
+define void @test(i32 %row, i32 %N.in) nounwind {
+entry:
+ %N = bitcast i32 %N.in to i32 ; <i32> [#uses=1]
+ %tmp5 = icmp sgt i32 %N.in, 0 ; <i1> [#uses=1]
+ br i1 %tmp5, label %cond_true, label %return
+
+cond_true: ; preds = %cond_true, %entry
+ %indvar = phi i32 [ 0, %entry ], [ %indvar.next, %cond_true ] ; <i32> [#uses=2]
+ %i.0.0 = bitcast i32 %indvar to i32 ; <i32> [#uses=2]
+ %tmp2 = add i32 %i.0.0, 1 ; <i32> [#uses=1]
+ %tmp = getelementptr [16 x [16 x i32]]* @A, i32 0, i32 %row, i32 %tmp2 ; <i32*> [#uses=1]
+ store i32 4, i32* %tmp
+ %tmp5.upgrd.1 = add i32 %i.0.0, 2 ; <i32> [#uses=1]
+ %tmp7 = getelementptr [16 x [16 x i32]]* @A, i32 0, i32 %row, i32 %tmp5.upgrd.1 ; <i32*> [#uses=1]
+ store i32 5, i32* %tmp7
+ %indvar.next = add i32 %indvar, 1 ; <i32> [#uses=2]
+ %exitcond = icmp eq i32 %indvar.next, %N ; <i1> [#uses=1]
+ br i1 %exitcond, label %return, label %cond_true
+
+return: ; preds = %cond_true, %entry
+ ret void
+}
diff --git a/test/CodeGen/X86/loop-strength-reduce-3.ll b/test/CodeGen/X86/loop-strength-reduce-3.ll
new file mode 100644
index 0000000..b6bb814
--- /dev/null
+++ b/test/CodeGen/X86/loop-strength-reduce-3.ll
@@ -0,0 +1,30 @@
+; RUN: llvm-as < %s | llc -mtriple=i386-apple-darwin -relocation-model=dynamic-no-pic | \
+; RUN: grep {A+} | count 2
+;
+; Make sure the common loop invariant A is not hoisted up to the preheader,
+; since it can be subsumed into the addressing modes.
+
+@A = global [16 x [16 x i32]] zeroinitializer, align 32 ; <[16 x [16 x i32]]*> [#uses=2]
+
+define void @test(i32 %row, i32 %N.in) nounwind {
+entry:
+ %N = bitcast i32 %N.in to i32 ; <i32> [#uses=1]
+ %tmp5 = icmp sgt i32 %N.in, 0 ; <i1> [#uses=1]
+ br i1 %tmp5, label %cond_true, label %return
+
+cond_true: ; preds = %cond_true, %entry
+ %indvar = phi i32 [ 0, %entry ], [ %indvar.next, %cond_true ] ; <i32> [#uses=2]
+ %i.0.0 = bitcast i32 %indvar to i32 ; <i32> [#uses=2]
+ %tmp2 = add i32 %i.0.0, 1 ; <i32> [#uses=1]
+ %tmp = getelementptr [16 x [16 x i32]]* @A, i32 0, i32 %row, i32 %tmp2 ; <i32*> [#uses=1]
+ store i32 4, i32* %tmp
+ %tmp5.upgrd.1 = add i32 %i.0.0, 2 ; <i32> [#uses=1]
+ %tmp7 = getelementptr [16 x [16 x i32]]* @A, i32 0, i32 %row, i32 %tmp5.upgrd.1 ; <i32*> [#uses=1]
+ store i32 5, i32* %tmp7
+ %indvar.next = add i32 %indvar, 1 ; <i32> [#uses=2]
+ %exitcond = icmp eq i32 %indvar.next, %N ; <i1> [#uses=1]
+ br i1 %exitcond, label %return, label %cond_true
+
+return: ; preds = %cond_true, %entry
+ ret void
+}
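
For reference, the loop in these two tests (and in the updated loop-strength-reduce.ll below) corresponds to roughly the following C++, reconstructed from the getelementptr/store pattern; treat it as an approximation, not the original source. With a static or dynamic-no-pic relocation model the address of the global A folds into both store addresses (hence two "A+" references in the grep), while under PIC it needs a register of its own and is hoisted into the preheader once (the single "A-" reference in loop-strength-reduce-2.ll).

    // Approximate C++ equivalent of the IR in these tests (an assumption,
    // not the original source the tests were generated from).
    int A[16][16];

    void test(int row, int N) {
      // The IR enters the loop only when N > 0 and runs i = 0 .. N-1.
      for (int i = 0; i < N; ++i) {
        A[row][i + 1] = 4;  // both stores index into the same row of the global A
        A[row][i + 2] = 5;
      }
    }
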
diff --git a/test/CodeGen/X86/loop-strength-reduce.ll b/test/CodeGen/X86/loop-strength-reduce.ll
index 8bacbd9..8737101 100644
--- a/test/CodeGen/X86/loop-strength-reduce.ll
+++ b/test/CodeGen/X86/loop-strength-reduce.ll
@@ -1,7 +1,8 @@
-; RUN: llvm-as < %s | llc -march=x86 | \
-; RUN: grep {A(} | count 1
+; RUN: llvm-as < %s | llc -march=x86 -relocation-model=static | \
+; RUN: grep {A+} | count 2
;
-; Make sure the common loop invariant _A(reg) is hoisted up to preheader.
+; Make sure the common loop invariant A is not hoisted up to the preheader,
+; since it can be subsumed into the addressing mode in all uses.
@A = internal global [16 x [16 x i32]] zeroinitializer, align 32 ; <[16 x [16 x i32]]*> [#uses=2]