aboutsummaryrefslogtreecommitdiffstats
path: root/lib/Transforms/Scalar
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Transforms/Scalar')
-rw-r--r--lib/Transforms/Scalar/ADCE.cpp70
-rw-r--r--lib/Transforms/Scalar/AlignmentFromAssumptions.cpp16
-rw-r--r--lib/Transforms/Scalar/Android.mk6
-rw-r--r--lib/Transforms/Scalar/BDCE.cpp411
-rw-r--r--lib/Transforms/Scalar/CMakeLists.txt10
-rw-r--r--lib/Transforms/Scalar/ConstantHoisting.cpp9
-rw-r--r--lib/Transforms/Scalar/ConstantProp.cpp9
-rw-r--r--lib/Transforms/Scalar/DCE.cpp8
-rw-r--r--lib/Transforms/Scalar/DeadStoreElimination.cpp2
-rw-r--r--lib/Transforms/Scalar/EarlyCSE.cpp635
-rw-r--r--lib/Transforms/Scalar/GVN.cpp516
-rw-r--r--lib/Transforms/Scalar/IndVarSimplify.cpp14
-rw-r--r--lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp1422
-rw-r--r--lib/Transforms/Scalar/JumpThreading.cpp30
-rw-r--r--lib/Transforms/Scalar/LICM.cpp401
-rw-r--r--lib/Transforms/Scalar/LLVMBuild.txt2
-rw-r--r--lib/Transforms/Scalar/LoopDeletion.cpp8
-rw-r--r--lib/Transforms/Scalar/LoopIdiomRecognize.cpp98
-rw-r--r--lib/Transforms/Scalar/LoopInstSimplify.cpp29
-rw-r--r--lib/Transforms/Scalar/LoopRerollPass.cpp1285
-rw-r--r--lib/Transforms/Scalar/LoopRotation.cpp98
-rw-r--r--lib/Transforms/Scalar/LoopStrengthReduce.cpp36
-rw-r--r--lib/Transforms/Scalar/LoopUnrollPass.cpp495
-rw-r--r--lib/Transforms/Scalar/LoopUnswitch.cpp70
-rw-r--r--lib/Transforms/Scalar/LowerExpectIntrinsic.cpp192
-rw-r--r--lib/Transforms/Scalar/MemCpyOptimizer.cpp32
-rw-r--r--lib/Transforms/Scalar/MergedLoadStoreMotion.cpp83
-rw-r--r--lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp14
-rw-r--r--lib/Transforms/Scalar/PlaceSafepoints.cpp989
-rw-r--r--lib/Transforms/Scalar/Reassociate.cpp15
-rw-r--r--lib/Transforms/Scalar/Reg2Mem.cpp2
-rw-r--r--lib/Transforms/Scalar/RewriteStatepointsForGC.cpp1897
-rw-r--r--lib/Transforms/Scalar/SCCP.cpp14
-rw-r--r--lib/Transforms/Scalar/SROA.cpp1501
-rw-r--r--lib/Transforms/Scalar/SampleProfile.cpp6
-rw-r--r--lib/Transforms/Scalar/Scalar.cpp15
-rw-r--r--lib/Transforms/Scalar/ScalarReplAggregates.cpp31
-rw-r--r--lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp14
-rw-r--r--lib/Transforms/Scalar/SimplifyCFGPass.cpp118
-rw-r--r--lib/Transforms/Scalar/Sink.cpp8
-rw-r--r--lib/Transforms/Scalar/StraightLineStrengthReduce.cpp274
-rw-r--r--lib/Transforms/Scalar/StructurizeCFG.cpp75
-rw-r--r--lib/Transforms/Scalar/TailRecursionElimination.cpp6
43 files changed, 8965 insertions, 2001 deletions
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp
index 3d91984..d6fc916 100644
--- a/lib/Transforms/Scalar/ADCE.cpp
+++ b/lib/Transforms/Scalar/ADCE.cpp
@@ -32,19 +32,18 @@ using namespace llvm;
STATISTIC(NumRemoved, "Number of instructions removed");
namespace {
- struct ADCE : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- ADCE() : FunctionPass(ID) {
- initializeADCEPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function& F) override;
+struct ADCE : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ ADCE() : FunctionPass(ID) {
+ initializeADCEPass(*PassRegistry::getPassRegistry());
+ }
- void getAnalysisUsage(AnalysisUsage& AU) const override {
- AU.setPreservesCFG();
- }
+ bool runOnFunction(Function& F) override;
- };
+ void getAnalysisUsage(AnalysisUsage& AU) const override {
+ AU.setPreservesCFG();
+ }
+};
}
char ADCE::ID = 0;
@@ -54,46 +53,45 @@ bool ADCE::runOnFunction(Function& F) {
if (skipOptnoneFunction(F))
return false;
- SmallPtrSet<Instruction*, 128> alive;
- SmallVector<Instruction*, 128> worklist;
+ SmallPtrSet<Instruction*, 128> Alive;
+ SmallVector<Instruction*, 128> Worklist;
// Collect the set of "root" instructions that are known live.
- for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
- if (isa<TerminatorInst>(I.getInstructionIterator()) ||
- isa<DbgInfoIntrinsic>(I.getInstructionIterator()) ||
- isa<LandingPadInst>(I.getInstructionIterator()) ||
- I->mayHaveSideEffects()) {
- alive.insert(I.getInstructionIterator());
- worklist.push_back(I.getInstructionIterator());
+ for (Instruction &I : inst_range(F)) {
+ if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) ||
+ isa<LandingPadInst>(I) || I.mayHaveSideEffects()) {
+ Alive.insert(&I);
+ Worklist.push_back(&I);
}
+ }
// Propagate liveness backwards to operands.
- while (!worklist.empty()) {
- Instruction* curr = worklist.pop_back_val();
- for (Instruction::op_iterator OI = curr->op_begin(), OE = curr->op_end();
- OI != OE; ++OI)
- if (Instruction* Inst = dyn_cast<Instruction>(OI))
- if (alive.insert(Inst).second)
- worklist.push_back(Inst);
+ while (!Worklist.empty()) {
+ Instruction *Curr = Worklist.pop_back_val();
+ for (Use &OI : Curr->operands()) {
+ if (Instruction *Inst = dyn_cast<Instruction>(OI))
+ if (Alive.insert(Inst).second)
+ Worklist.push_back(Inst);
+ }
}
// The inverse of the live set is the dead set. These are those instructions
// which have no side effects and do not influence the control flow or return
// value of the function, and may therefore be deleted safely.
- // NOTE: We reuse the worklist vector here for memory efficiency.
- for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
- if (!alive.count(I.getInstructionIterator())) {
- worklist.push_back(I.getInstructionIterator());
- I->dropAllReferences();
+ // NOTE: We reuse the Worklist vector here for memory efficiency.
+ for (Instruction &I : inst_range(F)) {
+ if (!Alive.count(&I)) {
+ Worklist.push_back(&I);
+ I.dropAllReferences();
}
+ }
- for (SmallVectorImpl<Instruction *>::iterator I = worklist.begin(),
- E = worklist.end(); I != E; ++I) {
+ for (Instruction *&I : Worklist) {
++NumRemoved;
- (*I)->eraseFromParent();
+ I->eraseFromParent();
}
- return !worklist.empty();
+ return !Worklist.empty();
}
FunctionPass *llvm::createAggressiveDCEPass() {
diff --git a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
index 06c3dfd..5c74885 100644
--- a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
+++ b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
@@ -21,7 +21,7 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/ScalarEvolution.h"
@@ -53,12 +53,12 @@ struct AlignmentFromAssumptions : public FunctionPass {
bool runOnFunction(Function &F);
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<AssumptionTracker>();
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<ScalarEvolution>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.setPreservesCFG();
- AU.addPreserved<LoopInfo>();
+ AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<ScalarEvolution>();
}
@@ -69,7 +69,6 @@ struct AlignmentFromAssumptions : public FunctionPass {
// another assumption later, then we may change the alignment at that point.
DenseMap<MemTransferInst *, unsigned> NewDestAlignments, NewSrcAlignments;
- AssumptionTracker *AT;
ScalarEvolution *SE;
DominatorTree *DT;
const DataLayout *DL;
@@ -84,7 +83,7 @@ char AlignmentFromAssumptions::ID = 0;
static const char aip_name[] = "Alignment from assumptions";
INITIALIZE_PASS_BEGIN(AlignmentFromAssumptions, AA_NAME,
aip_name, false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_END(AlignmentFromAssumptions, AA_NAME,
@@ -411,7 +410,7 @@ bool AlignmentFromAssumptions::processAssumption(CallInst *ACall) {
bool AlignmentFromAssumptions::runOnFunction(Function &F) {
bool Changed = false;
- AT = &getAnalysis<AssumptionTracker>();
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
SE = &getAnalysis<ScalarEvolution>();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
@@ -420,8 +419,9 @@ bool AlignmentFromAssumptions::runOnFunction(Function &F) {
NewDestAlignments.clear();
NewSrcAlignments.clear();
- for (auto &I : AT->assumptions(&F))
- Changed |= processAssumption(I);
+ for (auto &AssumeVH : AC.assumptions())
+ if (AssumeVH)
+ Changed |= processAssumption(cast<CallInst>(AssumeVH));
return Changed;
}
diff --git a/lib/Transforms/Scalar/Android.mk b/lib/Transforms/Scalar/Android.mk
index 9028b42..ed803cd 100644
--- a/lib/Transforms/Scalar/Android.mk
+++ b/lib/Transforms/Scalar/Android.mk
@@ -2,6 +2,7 @@ LOCAL_PATH:= $(call my-dir)
transforms_scalar_SRC_FILES := \
ADCE.cpp \
+ BDCE.cpp \
AlignmentFromAssumptions.cpp \
ConstantProp.cpp \
ConstantHoisting.cpp \
@@ -12,6 +13,7 @@ transforms_scalar_SRC_FILES := \
FlattenCFGPass.cpp \
GVN.cpp \
IndVarSimplify.cpp \
+ InductiveRangeCheckElimination.cpp \
JumpThreading.cpp \
LICM.cpp \
LoadCombine.cpp \
@@ -24,11 +26,14 @@ transforms_scalar_SRC_FILES := \
LoopUnrollPass.cpp \
LoopUnswitch.cpp \
LowerAtomic.cpp \
+ LowerExpectIntrinsic.cpp \
MemCpyOptimizer.cpp \
MergedLoadStoreMotion.cpp \
PartiallyInlineLibCalls.cpp \
+ PlaceSafepoints.cpp \
Reassociate.cpp \
Reg2Mem.cpp \
+ RewriteStatepointsForGC.cpp \
SCCP.cpp \
SROA.cpp \
SampleProfile.cpp \
@@ -38,6 +43,7 @@ transforms_scalar_SRC_FILES := \
SeparateConstOffsetFromGEP.cpp \
SimplifyCFGPass.cpp \
Sink.cpp \
+ StraightLineStrengthReduce.cpp \
StructurizeCFG.cpp \
TailRecursionElimination.cpp
diff --git a/lib/Transforms/Scalar/BDCE.cpp b/lib/Transforms/Scalar/BDCE.cpp
new file mode 100644
index 0000000..c7bd79d
--- /dev/null
+++ b/lib/Transforms/Scalar/BDCE.cpp
@@ -0,0 +1,411 @@
+//===---- BDCE.cpp - Bit-tracking dead code elimination -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Bit-Tracking Dead Code Elimination pass. Some
+// instructions (shifts, some ands, ors, etc.) kill some of their input bits.
+// We track these dead bits and remove instructions that compute only these
+// dead bits.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "bdce"
+
+STATISTIC(NumRemoved, "Number of instructions removed (unused)");
+STATISTIC(NumSimplified, "Number of instructions trivialized (dead bits)");
+
+namespace {
+struct BDCE : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ BDCE() : FunctionPass(ID) {
+ initializeBDCEPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function& F) override;
+
+ void getAnalysisUsage(AnalysisUsage& AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ }
+
+ void determineLiveOperandBits(const Instruction *UserI,
+ const Instruction *I, unsigned OperandNo,
+ const APInt &AOut, APInt &AB,
+ APInt &KnownZero, APInt &KnownOne,
+ APInt &KnownZero2, APInt &KnownOne2);
+
+ AssumptionCache *AC;
+ const DataLayout *DL;
+ DominatorTree *DT;
+};
+}
+
+char BDCE::ID = 0;
+INITIALIZE_PASS_BEGIN(BDCE, "bdce", "Bit-Tracking Dead Code Elimination",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(BDCE, "bdce", "Bit-Tracking Dead Code Elimination",
+ false, false)
+
+static bool isAlwaysLive(Instruction *I) {
+ return isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) ||
+ isa<LandingPadInst>(I) || I->mayHaveSideEffects();
+}
+
+void BDCE::determineLiveOperandBits(const Instruction *UserI,
+ const Instruction *I, unsigned OperandNo,
+ const APInt &AOut, APInt &AB,
+ APInt &KnownZero, APInt &KnownOne,
+ APInt &KnownZero2, APInt &KnownOne2) {
+ unsigned BitWidth = AB.getBitWidth();
+
+ // We're called once per operand, but for some instructions, we need to
+ // compute known bits of both operands in order to determine the live bits of
+ // either (when both operands are instructions themselves). We don't,
+ // however, want to do this twice, so we cache the result in APInts that live
+ // in the caller. For the two-relevant-operands case, both operand values are
+ // provided here.
+ auto ComputeKnownBits = [&](unsigned BitWidth, const Value *V1,
+ const Value *V2) {
+ KnownZero = APInt(BitWidth, 0);
+ KnownOne = APInt(BitWidth, 0);
+ computeKnownBits(const_cast<Value*>(V1), KnownZero, KnownOne, DL, 0, AC,
+ UserI, DT);
+
+ if (V2) {
+ KnownZero2 = APInt(BitWidth, 0);
+ KnownOne2 = APInt(BitWidth, 0);
+ computeKnownBits(const_cast<Value*>(V2), KnownZero2, KnownOne2, DL, 0, AC,
+ UserI, DT);
+ }
+ };
+
+ switch (UserI->getOpcode()) {
+ default: break;
+ case Instruction::Call:
+ case Instruction::Invoke:
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(UserI))
+ switch (II->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::bswap:
+ // The alive bits of the input are the swapped alive bits of
+ // the output.
+ AB = AOut.byteSwap();
+ break;
+ case Intrinsic::ctlz:
+ if (OperandNo == 0) {
+ // We need some output bits, so we need all bits of the
+ // input to the left of, and including, the leftmost bit
+ // known to be one.
+ ComputeKnownBits(BitWidth, I, nullptr);
+ AB = APInt::getHighBitsSet(BitWidth,
+ std::min(BitWidth, KnownOne.countLeadingZeros()+1));
+ }
+ break;
+ case Intrinsic::cttz:
+ if (OperandNo == 0) {
+ // We need some output bits, so we need all bits of the
+ // input to the right of, and including, the rightmost bit
+ // known to be one.
+ ComputeKnownBits(BitWidth, I, nullptr);
+ AB = APInt::getLowBitsSet(BitWidth,
+ std::min(BitWidth, KnownOne.countTrailingZeros()+1));
+ }
+ break;
+ }
+ break;
+ case Instruction::Add:
+ case Instruction::Sub:
+ // Find the highest live output bit. We don't need any more input
+ // bits than that (adds, and thus subtracts, ripple only to the
+ // left).
+ AB = APInt::getLowBitsSet(BitWidth, AOut.getActiveBits());
+ break;
+ case Instruction::Shl:
+ if (OperandNo == 0)
+ if (ConstantInt *CI =
+ dyn_cast<ConstantInt>(UserI->getOperand(1))) {
+ uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1);
+ AB = AOut.lshr(ShiftAmt);
+
+ // If the shift is nuw/nsw, then the high bits are not dead
+ // (because we've promised that they *must* be zero).
+ const ShlOperator *S = cast<ShlOperator>(UserI);
+ if (S->hasNoSignedWrap())
+ AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1);
+ else if (S->hasNoUnsignedWrap())
+ AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt);
+ }
+ break;
+ case Instruction::LShr:
+ if (OperandNo == 0)
+ if (ConstantInt *CI =
+ dyn_cast<ConstantInt>(UserI->getOperand(1))) {
+ uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1);
+ AB = AOut.shl(ShiftAmt);
+
+ // If the shift is exact, then the low bits are not dead
+ // (they must be zero).
+ if (cast<LShrOperator>(UserI)->isExact())
+ AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
+ }
+ break;
+ case Instruction::AShr:
+ if (OperandNo == 0)
+ if (ConstantInt *CI =
+ dyn_cast<ConstantInt>(UserI->getOperand(1))) {
+ uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1);
+ AB = AOut.shl(ShiftAmt);
+ // Because the high input bit is replicated into the
+ // high-order bits of the result, if we need any of those
+ // bits, then we must keep the highest input bit.
+ if ((AOut & APInt::getHighBitsSet(BitWidth, ShiftAmt))
+ .getBoolValue())
+ AB.setBit(BitWidth-1);
+
+ // If the shift is exact, then the low bits are not dead
+ // (they must be zero).
+ if (cast<AShrOperator>(UserI)->isExact())
+ AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
+ }
+ break;
+ case Instruction::And:
+ AB = AOut;
+
+ // For bits that are known zero, the corresponding bits in the
+ // other operand are dead (unless they're both zero, in which
+ // case they can't both be dead, so just mark the LHS bits as
+ // dead).
+ if (OperandNo == 0) {
+ ComputeKnownBits(BitWidth, I, UserI->getOperand(1));
+ AB &= ~KnownZero2;
+ } else {
+ if (!isa<Instruction>(UserI->getOperand(0)))
+ ComputeKnownBits(BitWidth, UserI->getOperand(0), I);
+ AB &= ~(KnownZero & ~KnownZero2);
+ }
+ break;
+ case Instruction::Or:
+ AB = AOut;
+
+ // For bits that are known one, the corresponding bits in the
+ // other operand are dead (unless they're both one, in which
+ // case they can't both be dead, so just mark the LHS bits as
+ // dead).
+ if (OperandNo == 0) {
+ ComputeKnownBits(BitWidth, I, UserI->getOperand(1));
+ AB &= ~KnownOne2;
+ } else {
+ if (!isa<Instruction>(UserI->getOperand(0)))
+ ComputeKnownBits(BitWidth, UserI->getOperand(0), I);
+ AB &= ~(KnownOne & ~KnownOne2);
+ }
+ break;
+ case Instruction::Xor:
+ case Instruction::PHI:
+ AB = AOut;
+ break;
+ case Instruction::Trunc:
+ AB = AOut.zext(BitWidth);
+ break;
+ case Instruction::ZExt:
+ AB = AOut.trunc(BitWidth);
+ break;
+ case Instruction::SExt:
+ AB = AOut.trunc(BitWidth);
+ // Because the high input bit is replicated into the
+ // high-order bits of the result, if we need any of those
+ // bits, then we must keep the highest input bit.
+ if ((AOut & APInt::getHighBitsSet(AOut.getBitWidth(),
+ AOut.getBitWidth() - BitWidth))
+ .getBoolValue())
+ AB.setBit(BitWidth-1);
+ break;
+ case Instruction::Select:
+ if (OperandNo != 0)
+ AB = AOut;
+ break;
+ }
+}
+
+bool BDCE::runOnFunction(Function& F) {
+ if (skipOptnoneFunction(F))
+ return false;
+
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ DL = F.getParent()->getDataLayout();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ DenseMap<Instruction *, APInt> AliveBits;
+ SmallVector<Instruction*, 128> Worklist;
+
+ // The set of visited instructions (non-integer-typed only).
+ SmallPtrSet<Instruction*, 128> Visited;
+
+ // Collect the set of "root" instructions that are known live.
+ for (Instruction &I : inst_range(F)) {
+ if (!isAlwaysLive(&I))
+ continue;
+
+ DEBUG(dbgs() << "BDCE: Root: " << I << "\n");
+ // For integer-valued instructions, set up an initial empty set of alive
+ // bits and add the instruction to the work list. For other instructions
+ // add their operands to the work list (for integer values operands, mark
+ // all bits as live).
+ if (IntegerType *IT = dyn_cast<IntegerType>(I.getType())) {
+ if (!AliveBits.count(&I)) {
+ AliveBits[&I] = APInt(IT->getBitWidth(), 0);
+ Worklist.push_back(&I);
+ }
+
+ continue;
+ }
+
+ // Non-integer-typed instructions...
+ for (Use &OI : I.operands()) {
+ if (Instruction *J = dyn_cast<Instruction>(OI)) {
+ if (IntegerType *IT = dyn_cast<IntegerType>(J->getType()))
+ AliveBits[J] = APInt::getAllOnesValue(IT->getBitWidth());
+ Worklist.push_back(J);
+ }
+ }
+ // To save memory, we don't add I to the Visited set here. Instead, we
+ // check isAlwaysLive on every instruction when searching for dead
+ // instructions later (we need to check isAlwaysLive for the
+ // integer-typed instructions anyway).
+ }
+
+ // Propagate liveness backwards to operands.
+ while (!Worklist.empty()) {
+ Instruction *UserI = Worklist.pop_back_val();
+
+ DEBUG(dbgs() << "BDCE: Visiting: " << *UserI);
+ APInt AOut;
+ if (UserI->getType()->isIntegerTy()) {
+ AOut = AliveBits[UserI];
+ DEBUG(dbgs() << " Alive Out: " << AOut);
+ }
+ DEBUG(dbgs() << "\n");
+
+ if (!UserI->getType()->isIntegerTy())
+ Visited.insert(UserI);
+
+ APInt KnownZero, KnownOne, KnownZero2, KnownOne2;
+ // Compute the set of alive bits for each operand. These are anded into the
+ // existing set, if any, and if that changes the set of alive bits, the
+ // operand is added to the work-list.
+ for (Use &OI : UserI->operands()) {
+ if (Instruction *I = dyn_cast<Instruction>(OI)) {
+ if (IntegerType *IT = dyn_cast<IntegerType>(I->getType())) {
+ unsigned BitWidth = IT->getBitWidth();
+ APInt AB = APInt::getAllOnesValue(BitWidth);
+ if (UserI->getType()->isIntegerTy() && !AOut &&
+ !isAlwaysLive(UserI)) {
+ AB = APInt(BitWidth, 0);
+ } else {
+ // If all bits of the output are dead, then all bits of the input
+ // Bits of each operand that are used to compute alive bits of the
+ // output are alive, all others are dead.
+ determineLiveOperandBits(UserI, I, OI.getOperandNo(), AOut, AB,
+ KnownZero, KnownOne,
+ KnownZero2, KnownOne2);
+ }
+
+ // If we've added to the set of alive bits (or the operand has not
+ // been previously visited), then re-queue the operand to be visited
+ // again.
+ APInt ABPrev(BitWidth, 0);
+ auto ABI = AliveBits.find(I);
+ if (ABI != AliveBits.end())
+ ABPrev = ABI->second;
+
+ APInt ABNew = AB | ABPrev;
+ if (ABNew != ABPrev || ABI == AliveBits.end()) {
+ AliveBits[I] = std::move(ABNew);
+ Worklist.push_back(I);
+ }
+ } else if (!Visited.count(I)) {
+ Worklist.push_back(I);
+ }
+ }
+ }
+ }
+
+ bool Changed = false;
+ // The inverse of the live set is the dead set. These are those instructions
+ // which have no side effects and do not influence the control flow or return
+ // value of the function, and may therefore be deleted safely.
+ // NOTE: We reuse the Worklist vector here for memory efficiency.
+ for (Instruction &I : inst_range(F)) {
+ // For live instructions that have all dead bits, first make them dead by
+ // replacing all uses with something else. Then, if they don't need to
+ // remain live (because they have side effects, etc.) we can remove them.
+ if (I.getType()->isIntegerTy()) {
+ auto ABI = AliveBits.find(&I);
+ if (ABI != AliveBits.end()) {
+ if (ABI->second.getBoolValue())
+ continue;
+
+ DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n");
+ // FIXME: In theory we could substitute undef here instead of zero.
+ // This should be reconsidered once we settle on the semantics of
+ // undef, poison, etc.
+ Value *Zero = ConstantInt::get(I.getType(), 0);
+ ++NumSimplified;
+ I.replaceAllUsesWith(Zero);
+ Changed = true;
+ }
+ } else if (Visited.count(&I)) {
+ continue;
+ }
+
+ if (isAlwaysLive(&I))
+ continue;
+
+ Worklist.push_back(&I);
+ I.dropAllReferences();
+ Changed = true;
+ }
+
+ for (Instruction *&I : Worklist) {
+ ++NumRemoved;
+ I->eraseFromParent();
+ }
+
+ return Changed;
+}
+
+FunctionPass *llvm::createBitTrackingDCEPass() {
+ return new BDCE();
+}
+
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index b3ee11e..d297eb1 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -1,6 +1,7 @@
add_llvm_library(LLVMScalarOpts
ADCE.cpp
AlignmentFromAssumptions.cpp
+ BDCE.cpp
ConstantHoisting.cpp
ConstantProp.cpp
CorrelatedValuePropagation.cpp
@@ -9,6 +10,7 @@ add_llvm_library(LLVMScalarOpts
EarlyCSE.cpp
FlattenCFGPass.cpp
GVN.cpp
+ InductiveRangeCheckElimination.cpp
IndVarSimplify.cpp
JumpThreading.cpp
LICM.cpp
@@ -22,11 +24,14 @@ add_llvm_library(LLVMScalarOpts
LoopUnrollPass.cpp
LoopUnswitch.cpp
LowerAtomic.cpp
+ LowerExpectIntrinsic.cpp
MemCpyOptimizer.cpp
MergedLoadStoreMotion.cpp
PartiallyInlineLibCalls.cpp
+ PlaceSafepoints.cpp
Reassociate.cpp
Reg2Mem.cpp
+ RewriteStatepointsForGC.cpp
SCCP.cpp
SROA.cpp
SampleProfile.cpp
@@ -36,8 +41,13 @@ add_llvm_library(LLVMScalarOpts
SeparateConstOffsetFromGEP.cpp
SimplifyCFGPass.cpp
Sink.cpp
+ StraightLineStrengthReduce.cpp
StructurizeCFG.cpp
TailRecursionElimination.cpp
+
+ ADDITIONAL_HEADER_DIRS
+ ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms
+ ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/Scalar
)
add_dependencies(LLVMScalarOpts intrinsics_gen)
diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp
index 27c177a..e3aab4b 100644
--- a/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -131,14 +131,14 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetTransformInfo>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
}
private:
/// \brief Initialize the pass.
void setup(Function &Fn) {
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- TTI = &getAnalysis<TargetTransformInfo>();
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn);
Entry = &Fn.getEntryBlock();
}
@@ -176,7 +176,7 @@ char ConstantHoisting::ID = 0;
INITIALIZE_PASS_BEGIN(ConstantHoisting, "consthoist", "Constant Hoisting",
false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(ConstantHoisting, "consthoist", "Constant Hoisting",
false, false)
@@ -186,6 +186,9 @@ FunctionPass *llvm::createConstantHoistingPass() {
/// \brief Perform the constant hoisting optimization for the given function.
bool ConstantHoisting::runOnFunction(Function &Fn) {
+ if (skipOptnoneFunction(Fn))
+ return false;
+
DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n");
DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');
diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp
index dd51ce1..29d4e05 100644
--- a/lib/Transforms/Scalar/ConstantProp.cpp
+++ b/lib/Transforms/Scalar/ConstantProp.cpp
@@ -26,7 +26,7 @@
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Pass.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include <set>
using namespace llvm;
@@ -45,7 +45,7 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<TargetLibraryInfo>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
}
};
}
@@ -53,7 +53,7 @@ namespace {
char ConstantPropagation::ID = 0;
INITIALIZE_PASS_BEGIN(ConstantPropagation, "constprop",
"Simple constant propagation", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(ConstantPropagation, "constprop",
"Simple constant propagation", false, false)
@@ -70,7 +70,8 @@ bool ConstantPropagation::runOnFunction(Function &F) {
bool Changed = false;
DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
- TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
+ TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
while (!WorkList.empty()) {
Instruction *I = *WorkList.begin();
diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp
index 99fac75..3b262a2 100644
--- a/lib/Transforms/Scalar/DCE.cpp
+++ b/lib/Transforms/Scalar/DCE.cpp
@@ -21,7 +21,7 @@
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Pass.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -42,7 +42,8 @@ namespace {
bool runOnBasicBlock(BasicBlock &BB) override {
if (skipOptnoneFunction(BB))
return false;
- TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr;
bool Changed = false;
for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
Instruction *Inst = DI++;
@@ -95,7 +96,8 @@ bool DCE::runOnFunction(Function &F) {
if (skipOptnoneFunction(F))
return false;
- TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr;
// Start out with all of the instructions in the worklist...
std::vector<Instruction*> WorkList;
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
index a1ddc00..c2ce1d5 100644
--- a/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -33,7 +33,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index cd2ecad..9309623 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -12,12 +12,13 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/EarlyCSE.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/ScopedHashTable.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
@@ -26,7 +27,8 @@
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/RecyclingAllocator.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
#include <deque>
using namespace llvm;
@@ -40,49 +42,44 @@ STATISTIC(NumCSELoad, "Number of load instructions CSE'd");
STATISTIC(NumCSECall, "Number of call instructions CSE'd");
STATISTIC(NumDSE, "Number of trivial dead stores removed");
-static unsigned getHash(const void *V) {
- return DenseMapInfo<const void*>::getHashValue(V);
-}
-
//===----------------------------------------------------------------------===//
// SimpleValue
//===----------------------------------------------------------------------===//
namespace {
- /// SimpleValue - Instances of this struct represent available values in the
- /// scoped hash table.
- struct SimpleValue {
- Instruction *Inst;
+/// \brief Struct representing the available values in the scoped hash table.
+struct SimpleValue {
+ Instruction *Inst;
- SimpleValue(Instruction *I) : Inst(I) {
- assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
- }
+ SimpleValue(Instruction *I) : Inst(I) {
+ assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
+ }
- bool isSentinel() const {
- return Inst == DenseMapInfo<Instruction*>::getEmptyKey() ||
- Inst == DenseMapInfo<Instruction*>::getTombstoneKey();
- }
+ bool isSentinel() const {
+ return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
+ Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
- static bool canHandle(Instruction *Inst) {
- // This can only handle non-void readnone functions.
- if (CallInst *CI = dyn_cast<CallInst>(Inst))
- return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy();
- return isa<CastInst>(Inst) || isa<BinaryOperator>(Inst) ||
- isa<GetElementPtrInst>(Inst) || isa<CmpInst>(Inst) ||
- isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) ||
- isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) ||
- isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst);
- }
- };
+ static bool canHandle(Instruction *Inst) {
+ // This can only handle non-void readnone functions.
+ if (CallInst *CI = dyn_cast<CallInst>(Inst))
+ return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy();
+ return isa<CastInst>(Inst) || isa<BinaryOperator>(Inst) ||
+ isa<GetElementPtrInst>(Inst) || isa<CmpInst>(Inst) ||
+ isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) ||
+ isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) ||
+ isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst);
+ }
+};
}
namespace llvm {
-template<> struct DenseMapInfo<SimpleValue> {
+template <> struct DenseMapInfo<SimpleValue> {
static inline SimpleValue getEmptyKey() {
- return DenseMapInfo<Instruction*>::getEmptyKey();
+ return DenseMapInfo<Instruction *>::getEmptyKey();
}
static inline SimpleValue getTombstoneKey() {
- return DenseMapInfo<Instruction*>::getTombstoneKey();
+ return DenseMapInfo<Instruction *>::getTombstoneKey();
}
static unsigned getHashValue(SimpleValue Val);
static bool isEqual(SimpleValue LHS, SimpleValue RHS);
@@ -92,7 +89,7 @@ template<> struct DenseMapInfo<SimpleValue> {
unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
Instruction *Inst = Val.Inst;
// Hash in all of the operands as pointers.
- if (BinaryOperator* BinOp = dyn_cast<BinaryOperator>(Inst)) {
+ if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst)) {
Value *LHS = BinOp->getOperand(0);
Value *RHS = BinOp->getOperand(1);
if (BinOp->isCommutative() && BinOp->getOperand(0) > BinOp->getOperand(1))
@@ -101,8 +98,9 @@ unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
if (isa<OverflowingBinaryOperator>(BinOp)) {
// Hash the overflow behavior
unsigned Overflow =
- BinOp->hasNoSignedWrap() * OverflowingBinaryOperator::NoSignedWrap |
- BinOp->hasNoUnsignedWrap() * OverflowingBinaryOperator::NoUnsignedWrap;
+ BinOp->hasNoSignedWrap() * OverflowingBinaryOperator::NoSignedWrap |
+ BinOp->hasNoUnsignedWrap() *
+ OverflowingBinaryOperator::NoUnsignedWrap;
return hash_combine(BinOp->getOpcode(), Overflow, LHS, RHS);
}
@@ -135,12 +133,13 @@ unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
assert((isa<CallInst>(Inst) || isa<BinaryOperator>(Inst) ||
isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) ||
isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
- isa<ShuffleVectorInst>(Inst)) && "Invalid/unknown instruction");
+ isa<ShuffleVectorInst>(Inst)) &&
+ "Invalid/unknown instruction");
// Mix in the opcode.
- return hash_combine(Inst->getOpcode(),
- hash_combine_range(Inst->value_op_begin(),
- Inst->value_op_end()));
+ return hash_combine(
+ Inst->getOpcode(),
+ hash_combine_range(Inst->value_op_begin(), Inst->value_op_end()));
}
bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
@@ -149,22 +148,24 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
if (LHS.isSentinel() || RHS.isSentinel())
return LHSI == RHSI;
- if (LHSI->getOpcode() != RHSI->getOpcode()) return false;
- if (LHSI->isIdenticalTo(RHSI)) return true;
+ if (LHSI->getOpcode() != RHSI->getOpcode())
+ return false;
+ if (LHSI->isIdenticalTo(RHSI))
+ return true;
// If we're not strictly identical, we still might be a commutable instruction
if (BinaryOperator *LHSBinOp = dyn_cast<BinaryOperator>(LHSI)) {
if (!LHSBinOp->isCommutative())
return false;
- assert(isa<BinaryOperator>(RHSI)
- && "same opcode, but different instruction type?");
+ assert(isa<BinaryOperator>(RHSI) &&
+ "same opcode, but different instruction type?");
BinaryOperator *RHSBinOp = cast<BinaryOperator>(RHSI);
// Check overflow attributes
if (isa<OverflowingBinaryOperator>(LHSBinOp)) {
- assert(isa<OverflowingBinaryOperator>(RHSBinOp)
- && "same opcode, but different operator type?");
+ assert(isa<OverflowingBinaryOperator>(RHSBinOp) &&
+ "same opcode, but different operator type?");
if (LHSBinOp->hasNoUnsignedWrap() != RHSBinOp->hasNoUnsignedWrap() ||
LHSBinOp->hasNoSignedWrap() != RHSBinOp->hasNoSignedWrap())
return false;
@@ -172,16 +173,16 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
// Commuted equality
return LHSBinOp->getOperand(0) == RHSBinOp->getOperand(1) &&
- LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0);
+ LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0);
}
if (CmpInst *LHSCmp = dyn_cast<CmpInst>(LHSI)) {
- assert(isa<CmpInst>(RHSI)
- && "same opcode, but different instruction type?");
+ assert(isa<CmpInst>(RHSI) &&
+ "same opcode, but different instruction type?");
CmpInst *RHSCmp = cast<CmpInst>(RHSI);
// Commuted equality
return LHSCmp->getOperand(0) == RHSCmp->getOperand(1) &&
- LHSCmp->getOperand(1) == RHSCmp->getOperand(0) &&
- LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate();
+ LHSCmp->getOperand(1) == RHSCmp->getOperand(0) &&
+ LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate();
}
return false;
@@ -192,57 +193,52 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
//===----------------------------------------------------------------------===//
namespace {
- /// CallValue - Instances of this struct represent available call values in
- /// the scoped hash table.
- struct CallValue {
- Instruction *Inst;
+/// \brief Struct representing the available call values in the scoped hash
+/// table.
+struct CallValue {
+ Instruction *Inst;
- CallValue(Instruction *I) : Inst(I) {
- assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
- }
+ CallValue(Instruction *I) : Inst(I) {
+ assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
+ }
- bool isSentinel() const {
- return Inst == DenseMapInfo<Instruction*>::getEmptyKey() ||
- Inst == DenseMapInfo<Instruction*>::getTombstoneKey();
- }
+ bool isSentinel() const {
+ return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
+ Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
- static bool canHandle(Instruction *Inst) {
- // Don't value number anything that returns void.
- if (Inst->getType()->isVoidTy())
- return false;
+ static bool canHandle(Instruction *Inst) {
+ // Don't value number anything that returns void.
+ if (Inst->getType()->isVoidTy())
+ return false;
- CallInst *CI = dyn_cast<CallInst>(Inst);
- if (!CI || !CI->onlyReadsMemory())
- return false;
- return true;
- }
- };
+ CallInst *CI = dyn_cast<CallInst>(Inst);
+ if (!CI || !CI->onlyReadsMemory())
+ return false;
+ return true;
+ }
+};
}
namespace llvm {
- template<> struct DenseMapInfo<CallValue> {
- static inline CallValue getEmptyKey() {
- return DenseMapInfo<Instruction*>::getEmptyKey();
- }
- static inline CallValue getTombstoneKey() {
- return DenseMapInfo<Instruction*>::getTombstoneKey();
- }
- static unsigned getHashValue(CallValue Val);
- static bool isEqual(CallValue LHS, CallValue RHS);
- };
+template <> struct DenseMapInfo<CallValue> {
+ static inline CallValue getEmptyKey() {
+ return DenseMapInfo<Instruction *>::getEmptyKey();
+ }
+ static inline CallValue getTombstoneKey() {
+ return DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+ static unsigned getHashValue(CallValue Val);
+ static bool isEqual(CallValue LHS, CallValue RHS);
+};
}
+
unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) {
Instruction *Inst = Val.Inst;
- // Hash in all of the operands as pointers.
- unsigned Res = 0;
- for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) {
- assert(!Inst->getOperand(i)->getType()->isMetadataTy() &&
- "Cannot value number calls with metadata operands");
- Res ^= getHash(Inst->getOperand(i)) << (i & 0xF);
- }
-
- // Mix in the opcode.
- return (Res << 1) ^ Inst->getOpcode();
+ // Hash all of the operands as pointers and mix in the opcode.
+ return hash_combine(
+ Inst->getOpcode(),
+ hash_combine_range(Inst->value_op_begin(), Inst->value_op_end()));
}
bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
@@ -252,103 +248,106 @@ bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
return LHSI->isIdenticalTo(RHSI);
}
-
//===----------------------------------------------------------------------===//
-// EarlyCSE pass.
+// EarlyCSE implementation
//===----------------------------------------------------------------------===//
namespace {
-
-/// EarlyCSE - This pass does a simple depth-first walk over the dominator
-/// tree, eliminating trivially redundant instructions and using instsimplify
-/// to canonicalize things as it goes. It is intended to be fast and catch
-/// obvious cases so that instcombine and other passes are more effective. It
-/// is expected that a later pass of GVN will catch the interesting/hard
-/// cases.
-class EarlyCSE : public FunctionPass {
+/// \brief A simple and fast domtree-based CSE pass.
+///
+/// This pass does a simple depth-first walk over the dominator tree,
+/// eliminating trivially redundant instructions and using instsimplify to
+/// canonicalize things as it goes. It is intended to be fast and catch obvious
+/// cases so that instcombine and other passes are more effective. It is
+/// expected that a later pass of GVN will catch the interesting/hard cases.
+class EarlyCSE {
public:
+ Function &F;
const DataLayout *DL;
- const TargetLibraryInfo *TLI;
- DominatorTree *DT;
- AssumptionTracker *AT;
- typedef RecyclingAllocator<BumpPtrAllocator,
- ScopedHashTableVal<SimpleValue, Value*> > AllocatorTy;
- typedef ScopedHashTable<SimpleValue, Value*, DenseMapInfo<SimpleValue>,
+ const TargetLibraryInfo &TLI;
+ const TargetTransformInfo &TTI;
+ DominatorTree &DT;
+ AssumptionCache &AC;
+ typedef RecyclingAllocator<
+ BumpPtrAllocator, ScopedHashTableVal<SimpleValue, Value *>> AllocatorTy;
+ typedef ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>,
AllocatorTy> ScopedHTType;
- /// AvailableValues - This scoped hash table contains the current values of
- /// all of our simple scalar expressions. As we walk down the domtree, we
- /// look to see if instructions are in this: if so, we replace them with what
- /// we find, otherwise we insert them so that dominated values can succeed in
- /// their lookup.
- ScopedHTType *AvailableValues;
-
- /// AvailableLoads - This scoped hash table contains the current values
- /// of loads. This allows us to get efficient access to dominating loads when
- /// we have a fully redundant load. In addition to the most recent load, we
- /// keep track of a generation count of the read, which is compared against
- /// the current generation count. The current generation count is
- /// incremented after every possibly writing memory operation, which ensures
- /// that we only CSE loads with other loads that have no intervening store.
- typedef RecyclingAllocator<BumpPtrAllocator,
- ScopedHashTableVal<Value*, std::pair<Value*, unsigned> > > LoadMapAllocator;
- typedef ScopedHashTable<Value*, std::pair<Value*, unsigned>,
- DenseMapInfo<Value*>, LoadMapAllocator> LoadHTType;
- LoadHTType *AvailableLoads;
-
- /// AvailableCalls - This scoped hash table contains the current values
- /// of read-only call values. It uses the same generation count as loads.
- typedef ScopedHashTable<CallValue, std::pair<Value*, unsigned> > CallHTType;
- CallHTType *AvailableCalls;
-
- /// CurrentGeneration - This is the current generation of the memory value.
+ /// \brief A scoped hash table of the current values of all of our simple
+ /// scalar expressions.
+ ///
+ /// As we walk down the domtree, we look to see if instructions are in this:
+ /// if so, we replace them with what we find, otherwise we insert them so
+ /// that dominated values can succeed in their lookup.
+ ScopedHTType AvailableValues;
+
+ /// \brief A scoped hash table of the current values of loads.
+ ///
+ /// This allows us to get efficient access to dominating loads when we have
+ /// a fully redundant load. In addition to the most recent load, we keep
+ /// track of a generation count of the read, which is compared against the
+ /// current generation count. The current generation count is incremented
+ /// after every possibly writing memory operation, which ensures that we only
+ /// CSE loads with other loads that have no intervening store.
+ typedef RecyclingAllocator<
+ BumpPtrAllocator,
+ ScopedHashTableVal<Value *, std::pair<Value *, unsigned>>>
+ LoadMapAllocator;
+ typedef ScopedHashTable<Value *, std::pair<Value *, unsigned>,
+ DenseMapInfo<Value *>, LoadMapAllocator> LoadHTType;
+ LoadHTType AvailableLoads;
+
+ /// \brief A scoped hash table of the current values of read-only call
+ /// values.
+ ///
+ /// It uses the same generation count as loads.
+ typedef ScopedHashTable<CallValue, std::pair<Value *, unsigned>> CallHTType;
+ CallHTType AvailableCalls;
+
+ /// \brief This is the current generation of the memory value.
unsigned CurrentGeneration;
- static char ID;
- explicit EarlyCSE() : FunctionPass(ID) {
- initializeEarlyCSEPass(*PassRegistry::getPassRegistry());
+ /// \brief Set up the EarlyCSE runner for a particular function.
+ EarlyCSE(Function &F, const DataLayout *DL, const TargetLibraryInfo &TLI,
+ const TargetTransformInfo &TTI, DominatorTree &DT,
+ AssumptionCache &AC)
+ : F(F), DL(DL), TLI(TLI), TTI(TTI), DT(DT), AC(AC), CurrentGeneration(0) {
}
- bool runOnFunction(Function &F) override;
+ bool run();
private:
-
- // NodeScope - almost a POD, but needs to call the constructors for the
- // scoped hash tables so that a new scope gets pushed on. These are RAII so
- // that the scope gets popped when the NodeScope is destroyed.
+ // Almost a POD, but needs to call the constructors for the scoped hash
+ // tables so that a new scope gets pushed on. These are RAII so that the
+ // scope gets popped when the NodeScope is destroyed.
class NodeScope {
- public:
- NodeScope(ScopedHTType *availableValues,
- LoadHTType *availableLoads,
- CallHTType *availableCalls) :
- Scope(*availableValues),
- LoadScope(*availableLoads),
- CallScope(*availableCalls) {}
-
- private:
- NodeScope(const NodeScope&) LLVM_DELETED_FUNCTION;
- void operator=(const NodeScope&) LLVM_DELETED_FUNCTION;
+ public:
+ NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
+ CallHTType &AvailableCalls)
+ : Scope(AvailableValues), LoadScope(AvailableLoads),
+ CallScope(AvailableCalls) {}
+
+ private:
+ NodeScope(const NodeScope &) = delete;
+ void operator=(const NodeScope &) = delete;
ScopedHTType::ScopeTy Scope;
LoadHTType::ScopeTy LoadScope;
CallHTType::ScopeTy CallScope;
};
- // StackNode - contains all the needed information to create a stack for
- // doing a depth first tranversal of the tree. This includes scopes for
- // values, loads, and calls as well as the generation. There is a child
- // iterator so that the children do not need to be store spearately.
+ // Contains all the needed information to create a stack for doing a depth
+ // first tranversal of the tree. This includes scopes for values, loads, and
+ // calls as well as the generation. There is a child iterator so that the
+ // children do not need to be store spearately.
class StackNode {
- public:
- StackNode(ScopedHTType *availableValues,
- LoadHTType *availableLoads,
- CallHTType *availableCalls,
- unsigned cg, DomTreeNode *n,
- DomTreeNode::iterator child, DomTreeNode::iterator end) :
- CurrentGeneration(cg), ChildGeneration(cg), Node(n),
- ChildIter(child), EndIter(end),
- Scopes(availableValues, availableLoads, availableCalls),
- Processed(false) {}
+ public:
+ StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
+ CallHTType &AvailableCalls, unsigned cg, DomTreeNode *n,
+ DomTreeNode::iterator child, DomTreeNode::iterator end)
+ : CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
+ EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls),
+ Processed(false) {}
// Accessors.
unsigned currentGeneration() { return CurrentGeneration; }
@@ -365,9 +364,9 @@ private:
bool isProcessed() { return Processed; }
void process() { Processed = true; }
- private:
- StackNode(const StackNode&) LLVM_DELETED_FUNCTION;
- void operator=(const StackNode&) LLVM_DELETED_FUNCTION;
+ private:
+ StackNode(const StackNode &) = delete;
+ void operator=(const StackNode &) = delete;
// Members.
unsigned CurrentGeneration;
@@ -379,31 +378,78 @@ private:
bool Processed;
};
+ /// \brief Wrapper class to handle memory instructions, including loads,
+ /// stores and intrinsic loads and stores defined by the target.
+ class ParseMemoryInst {
+ public:
+ ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI)
+ : Load(false), Store(false), Vol(false), MayReadFromMemory(false),
+ MayWriteToMemory(false), MatchingId(-1), Ptr(nullptr) {
+ MayReadFromMemory = Inst->mayReadFromMemory();
+ MayWriteToMemory = Inst->mayWriteToMemory();
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+ MemIntrinsicInfo Info;
+ if (!TTI.getTgtMemIntrinsic(II, Info))
+ return;
+ if (Info.NumMemRefs == 1) {
+ Store = Info.WriteMem;
+ Load = Info.ReadMem;
+ MatchingId = Info.MatchingId;
+ MayReadFromMemory = Info.ReadMem;
+ MayWriteToMemory = Info.WriteMem;
+ Vol = Info.Vol;
+ Ptr = Info.PtrVal;
+ }
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ Load = true;
+ Vol = !LI->isSimple();
+ Ptr = LI->getPointerOperand();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ Store = true;
+ Vol = !SI->isSimple();
+ Ptr = SI->getPointerOperand();
+ }
+ }
+ bool isLoad() { return Load; }
+ bool isStore() { return Store; }
+ bool isVolatile() { return Vol; }
+ bool isMatchingMemLoc(const ParseMemoryInst &Inst) {
+ return Ptr == Inst.Ptr && MatchingId == Inst.MatchingId;
+ }
+ bool isValid() { return Ptr != nullptr; }
+ int getMatchingId() { return MatchingId; }
+ Value *getPtr() { return Ptr; }
+ bool mayReadFromMemory() { return MayReadFromMemory; }
+ bool mayWriteToMemory() { return MayWriteToMemory; }
+
+ private:
+ bool Load;
+ bool Store;
+ bool Vol;
+ bool MayReadFromMemory;
+ bool MayWriteToMemory;
+ // For regular (non-intrinsic) loads/stores, this is set to -1. For
+ // intrinsic loads/stores, the id is retrieved from the corresponding
+ // field in the MemIntrinsicInfo structure. That field contains
+ // non-negative values only.
+ int MatchingId;
+ Value *Ptr;
+ };
+
bool processNode(DomTreeNode *Node);
- // This transformation requires dominator postdominator info
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetLibraryInfo>();
- AU.setPreservesCFG();
+ Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+ return LI;
+ else if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ return SI->getValueOperand();
+ assert(isa<IntrinsicInst>(Inst) && "Instruction not supported");
+ return TTI.getOrCreateResultFromMemIntrinsic(cast<IntrinsicInst>(Inst),
+ ExpectedType);
}
};
}
-char EarlyCSE::ID = 0;
-
-// createEarlyCSEPass - The public interface to this file.
-FunctionPass *llvm::createEarlyCSEPass() {
- return new EarlyCSE();
-}
-
-INITIALIZE_PASS_BEGIN(EarlyCSE, "early-cse", "Early CSE", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
-INITIALIZE_PASS_END(EarlyCSE, "early-cse", "Early CSE", false, false)
-
bool EarlyCSE::processNode(DomTreeNode *Node) {
BasicBlock *BB = Node->getBlock();
@@ -420,17 +466,17 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
/// as long as there in no instruction that reads memory. If we see a store
/// to the same location, we delete the dead store. This zaps trivial dead
/// stores which can occur in bitfield code among other things.
- StoreInst *LastStore = nullptr;
+ Instruction *LastStore = nullptr;
bool Changed = false;
// See if any instructions in the block can be eliminated. If so, do it. If
// not, add them to AvailableValues.
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
Instruction *Inst = I++;
// Dead instructions should just be removed.
- if (isInstructionTriviallyDead(Inst, TLI)) {
+ if (isInstructionTriviallyDead(Inst, &TLI)) {
DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n');
Inst->eraseFromParent();
Changed = true;
@@ -449,7 +495,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// If the instruction can be simplified (e.g. X+0 = X) then replace it with
// its simpler value.
- if (Value *V = SimplifyInstruction(Inst, DL, TLI, DT, AT)) {
+ if (Value *V = SimplifyInstruction(Inst, DL, &TLI, &DT, &AC)) {
DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n');
Inst->replaceAllUsesWith(V);
Inst->eraseFromParent();
@@ -461,7 +507,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// If this is a simple instruction that we can value number, process it.
if (SimpleValue::canHandle(Inst)) {
// See if the instruction has an available value. If so, use it.
- if (Value *V = AvailableValues->lookup(Inst)) {
+ if (Value *V = AvailableValues.lookup(Inst)) {
DEBUG(dbgs() << "EarlyCSE CSE: " << *Inst << " to: " << *V << '\n');
Inst->replaceAllUsesWith(V);
Inst->eraseFromParent();
@@ -471,52 +517,66 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
}
// Otherwise, just remember that this value is available.
- AvailableValues->insert(Inst, Inst);
+ AvailableValues.insert(Inst, Inst);
continue;
}
+ ParseMemoryInst MemInst(Inst, TTI);
// If this is a non-volatile load, process it.
- if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ if (MemInst.isValid() && MemInst.isLoad()) {
// Ignore volatile loads.
- if (!LI->isSimple()) {
+ if (MemInst.isVolatile()) {
LastStore = nullptr;
+ // Don't CSE across synchronization boundaries.
+ if (Inst->mayWriteToMemory())
+ ++CurrentGeneration;
continue;
}
// If we have an available version of this load, and if it is the right
// generation, replace this instruction.
- std::pair<Value*, unsigned> InVal =
- AvailableLoads->lookup(Inst->getOperand(0));
+ std::pair<Value *, unsigned> InVal =
+ AvailableLoads.lookup(MemInst.getPtr());
if (InVal.first != nullptr && InVal.second == CurrentGeneration) {
- DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst << " to: "
- << *InVal.first << '\n');
- if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first);
- Inst->eraseFromParent();
- Changed = true;
- ++NumCSELoad;
- continue;
+ Value *Op = getOrCreateResult(InVal.first, Inst->getType());
+ if (Op != nullptr) {
+ DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
+ << " to: " << *InVal.first << '\n');
+ if (!Inst->use_empty())
+ Inst->replaceAllUsesWith(Op);
+ Inst->eraseFromParent();
+ Changed = true;
+ ++NumCSELoad;
+ continue;
+ }
}
// Otherwise, remember that we have this instruction.
- AvailableLoads->insert(Inst->getOperand(0),
- std::pair<Value*, unsigned>(Inst, CurrentGeneration));
+ AvailableLoads.insert(MemInst.getPtr(), std::pair<Value *, unsigned>(
+ Inst, CurrentGeneration));
LastStore = nullptr;
continue;
}
// If this instruction may read from memory, forget LastStore.
- if (Inst->mayReadFromMemory())
+ // Load/store intrinsics will indicate both a read and a write to
+ // memory. The target may override this (e.g. so that a store intrinsic
+ // does not read from memory, and thus will be treated the same as a
+ // regular store for commoning purposes).
+ if (Inst->mayReadFromMemory() &&
+ !(MemInst.isValid() && !MemInst.mayReadFromMemory()))
LastStore = nullptr;
// If this is a read-only call, process it.
if (CallValue::canHandle(Inst)) {
// If we have an available version of this call, and if it is the right
// generation, replace this instruction.
- std::pair<Value*, unsigned> InVal = AvailableCalls->lookup(Inst);
+ std::pair<Value *, unsigned> InVal = AvailableCalls.lookup(Inst);
if (InVal.first != nullptr && InVal.second == CurrentGeneration) {
- DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst << " to: "
- << *InVal.first << '\n');
- if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first);
+ DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst
+ << " to: " << *InVal.first << '\n');
+ if (!Inst->use_empty())
+ Inst->replaceAllUsesWith(InVal.first);
Inst->eraseFromParent();
Changed = true;
++NumCSECall;
@@ -524,8 +584,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
}
// Otherwise, remember that we have this instruction.
- AvailableCalls->insert(Inst,
- std::pair<Value*, unsigned>(Inst, CurrentGeneration));
+ AvailableCalls.insert(
+ Inst, std::pair<Value *, unsigned>(Inst, CurrentGeneration));
continue;
}
@@ -535,17 +595,19 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
if (Inst->mayWriteToMemory()) {
++CurrentGeneration;
- if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ if (MemInst.isValid() && MemInst.isStore()) {
// We do a trivial form of DSE if there are two stores to the same
// location with no intervening loads. Delete the earlier store.
- if (LastStore &&
- LastStore->getPointerOperand() == SI->getPointerOperand()) {
- DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore << " due to: "
- << *Inst << '\n');
- LastStore->eraseFromParent();
- Changed = true;
- ++NumDSE;
- LastStore = nullptr;
+ if (LastStore) {
+ ParseMemoryInst LastStoreMemInst(LastStore, TTI);
+ if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {
+ DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
+ << " due to: " << *Inst << '\n');
+ LastStore->eraseFromParent();
+ Changed = true;
+ ++NumDSE;
+ LastStore = nullptr;
+ }
// fallthrough - we can exploit information about this store
}
@@ -554,12 +616,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// version of the pointer. It is safe to forward from volatile stores
// to non-volatile loads, so we don't have to check for volatility of
// the store.
- AvailableLoads->insert(SI->getPointerOperand(),
- std::pair<Value*, unsigned>(SI->getValueOperand(), CurrentGeneration));
+ AvailableLoads.insert(MemInst.getPtr(), std::pair<Value *, unsigned>(
+ Inst, CurrentGeneration));
// Remember that this was the last store we saw for DSE.
- if (SI->isSimple())
- LastStore = SI;
+ if (!MemInst.isVolatile())
+ LastStore = Inst;
}
}
}
@@ -567,40 +629,20 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
return Changed;
}
-
-bool EarlyCSE::runOnFunction(Function &F) {
- if (skipOptnoneFunction(F))
- return false;
-
- // Note, deque is being used here because there is significant performance gains
- // over vector when the container becomes very large due to the specific access
- // patterns. For more information see the mailing list discussion on this:
+bool EarlyCSE::run() {
+ // Note, deque is being used here because there is significant performance
+ // gains over vector when the container becomes very large due to the
+ // specific access patterns. For more information see the mailing list
+ // discussion on this:
// http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20120116/135228.html
std::deque<StackNode *> nodesToProcess;
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
- TLI = &getAnalysis<TargetLibraryInfo>();
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- AT = &getAnalysis<AssumptionTracker>();
-
- // Tables that the pass uses when walking the domtree.
- ScopedHTType AVTable;
- AvailableValues = &AVTable;
- LoadHTType LoadTable;
- AvailableLoads = &LoadTable;
- CallHTType CallTable;
- AvailableCalls = &CallTable;
-
- CurrentGeneration = 0;
bool Changed = false;
// Process the root node.
- nodesToProcess.push_back(
- new StackNode(AvailableValues, AvailableLoads, AvailableCalls,
- CurrentGeneration, DT->getRootNode(),
- DT->getRootNode()->begin(),
- DT->getRootNode()->end()));
+ nodesToProcess.push_back(new StackNode(
+ AvailableValues, AvailableLoads, AvailableCalls, CurrentGeneration,
+ DT.getRootNode(), DT.getRootNode()->begin(), DT.getRootNode()->end()));
// Save the current generation.
unsigned LiveOutGeneration = CurrentGeneration;
@@ -624,11 +666,9 @@ bool EarlyCSE::runOnFunction(Function &F) {
// Push the next child onto the stack.
DomTreeNode *child = NodeToProcess->nextChild();
nodesToProcess.push_back(
- new StackNode(AvailableValues,
- AvailableLoads,
- AvailableCalls,
- NodeToProcess->childGeneration(), child,
- child->begin(), child->end()));
+ new StackNode(AvailableValues, AvailableLoads, AvailableCalls,
+ NodeToProcess->childGeneration(), child, child->begin(),
+ child->end()));
} else {
// It has been processed, and there are no more children to process,
// so delete it and pop it off the stack.
@@ -642,3 +682,78 @@ bool EarlyCSE::runOnFunction(Function &F) {
return Changed;
}
+
+PreservedAnalyses EarlyCSEPass::run(Function &F,
+ AnalysisManager<Function> *AM) {
+ const DataLayout *DL = F.getParent()->getDataLayout();
+
+ auto &TLI = AM->getResult<TargetLibraryAnalysis>(F);
+ auto &TTI = AM->getResult<TargetIRAnalysis>(F);
+ auto &DT = AM->getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM->getResult<AssumptionAnalysis>(F);
+
+ EarlyCSE CSE(F, DL, TLI, TTI, DT, AC);
+
+ if (!CSE.run())
+ return PreservedAnalyses::all();
+
+ // CSE preserves the dominator tree because it doesn't mutate the CFG.
+ // FIXME: Bundle this with other CFG-preservation.
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ return PA;
+}
+
+namespace {
+/// \brief A simple and fast domtree-based CSE pass.
+///
+/// This pass does a simple depth-first walk over the dominator tree,
+/// eliminating trivially redundant instructions and using instsimplify to
+/// canonicalize things as it goes. It is intended to be fast and catch obvious
+/// cases so that instcombine and other passes are more effective. It is
+/// expected that a later pass of GVN will catch the interesting/hard cases.
+class EarlyCSELegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ EarlyCSELegacyPass() : FunctionPass(ID) {
+ initializeEarlyCSELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipOptnoneFunction(F))
+ return false;
+
+ DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+ auto *DL = DLP ? &DLP->getDataLayout() : nullptr;
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+
+ EarlyCSE CSE(F, DL, TLI, TTI, DT, AC);
+
+ return CSE.run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+}
+
+char EarlyCSELegacyPass::ID = 0;
+
+FunctionPass *llvm::createEarlyCSEPass() { return new EarlyCSELegacyPass(); }
+
+INITIALIZE_PASS_BEGIN(EarlyCSELegacyPass, "early-cse", "Early CSE", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(EarlyCSELegacyPass, "early-cse", "Early CSE", false, false)
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 7dba4e2..73a1f25 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -20,11 +20,12 @@
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
@@ -44,7 +45,7 @@
#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -457,7 +458,7 @@ uint32_t ValueTable::lookup_or_add(Value *V) {
return e;
}
-/// lookup - Returns the value number of the specified value. Fails if
+/// Returns the value number of the specified value. Fails if
/// the value has not yet been numbered.
uint32_t ValueTable::lookup(Value *V) const {
DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V);
@@ -465,7 +466,7 @@ uint32_t ValueTable::lookup(Value *V) const {
return VI->second;
}
-/// lookup_or_add_cmp - Returns the value number of the given comparison,
+/// Returns the value number of the given comparison,
/// assigning it a new number if it did not have one before. Useful when
/// we deduced the result of a comparison, but don't immediately have an
/// instruction realizing that comparison to hand.
@@ -478,14 +479,14 @@ uint32_t ValueTable::lookup_or_add_cmp(unsigned Opcode,
return e;
}
-/// clear - Remove all entries from the ValueTable.
+/// Remove all entries from the ValueTable.
void ValueTable::clear() {
valueNumbering.clear();
expressionNumbering.clear();
nextValueNumber = 1;
}
-/// erase - Remove a value from the value numbering.
+/// Remove a value from the value numbering.
void ValueTable::erase(Value *V) {
valueNumbering.erase(V);
}
@@ -581,8 +582,8 @@ namespace {
return cast<MemIntrinsic>(Val.getPointer());
}
- /// MaterializeAdjustedValue - Emit code into this block to adjust the value
- /// defined here to the specified type. This handles various coercion cases.
+ /// Emit code into this block to adjust the value defined here to the
+ /// specified type. This handles various coercion cases.
Value *MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const;
};
@@ -592,12 +593,12 @@ namespace {
DominatorTree *DT;
const DataLayout *DL;
const TargetLibraryInfo *TLI;
- AssumptionTracker *AT;
+ AssumptionCache *AC;
SetVector<BasicBlock *> DeadBlocks;
ValueTable VN;
- /// LeaderTable - A mapping from value numbers to lists of Value*'s that
+ /// A mapping from value numbers to lists of Value*'s that
/// have that value number. Use findLeader to query it.
struct LeaderTableEntry {
Value *Val;
@@ -622,7 +623,7 @@ namespace {
bool runOnFunction(Function &F) override;
- /// markInstructionForDeletion - This removes the specified instruction from
+ /// This removes the specified instruction from
/// our various maps and marks it for deletion.
void markInstructionForDeletion(Instruction *I) {
VN.erase(I);
@@ -634,8 +635,7 @@ namespace {
AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); }
MemoryDependenceAnalysis &getMemDep() const { return *MD; }
private:
- /// addToLeaderTable - Push a new Value to the LeaderTable onto the list for
- /// its value number.
+ /// Push a new Value to the LeaderTable onto the list for its value number.
void addToLeaderTable(uint32_t N, Value *V, const BasicBlock *BB) {
LeaderTableEntry &Curr = LeaderTable[N];
if (!Curr.Val) {
@@ -651,7 +651,7 @@ namespace {
Curr.Next = Node;
}
- /// removeFromLeaderTable - Scan the list of values corresponding to a given
+ /// Scan the list of values corresponding to a given
/// value number, and remove the given instruction if encountered.
void removeFromLeaderTable(uint32_t N, Instruction *I, BasicBlock *BB) {
LeaderTableEntry* Prev = nullptr;
@@ -682,9 +682,9 @@ namespace {
// This transformation requires dominator postdominator info
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionTracker>();
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetLibraryInfo>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
if (!NoLoads)
AU.addRequired<MemoryDependenceAnalysis>();
AU.addRequired<AliasAnalysis>();
@@ -709,6 +709,9 @@ namespace {
void dump(DenseMap<uint32_t, Value*> &d);
bool iterateOnFunction(Function &F);
bool performPRE(Function &F);
+ bool performScalarPRE(Instruction *I);
+ bool performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
+ unsigned int ValNo);
Value *findLeader(const BasicBlock *BB, uint32_t num);
void cleanupGlobalSets();
void verifyRemoved(const Instruction *I) const;
@@ -725,16 +728,16 @@ namespace {
char GVN::ID = 0;
}
-// createGVNPass - The public interface to this file...
+// The public interface to this file...
FunctionPass *llvm::createGVNPass(bool NoLoads) {
return new GVN(NoLoads);
}
INITIALIZE_PASS_BEGIN(GVN, "gvn", "Global Value Numbering", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false)
@@ -750,7 +753,7 @@ void GVN::dump(DenseMap<uint32_t, Value*>& d) {
}
#endif
-/// IsValueFullyAvailableInBlock - Return true if we can prove that the value
+/// Return true if we can prove that the value
/// we're analyzing is fully available in the specified block. As we go, keep
/// track of which blocks we know are fully alive in FullyAvailableBlocks. This
/// map is actually a tri-state map with the following values:
@@ -796,7 +799,7 @@ static bool IsValueFullyAvailableInBlock(BasicBlock *BB,
return true;
-// SpeculationFailure - If we get here, we found out that this is not, after
+// If we get here, we found out that this is not, after
// all, a fully-available block. We have a problem if we speculated on this and
// used the speculation to mark other blocks as available.
SpeculationFailure:
@@ -831,8 +834,7 @@ SpeculationFailure:
}
-/// CanCoerceMustAliasedValueToLoad - Return true if
-/// CoerceAvailableValueToLoadType will succeed.
+/// Return true if CoerceAvailableValueToLoadType will succeed.
static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
Type *LoadTy,
const DataLayout &DL) {
@@ -851,7 +853,7 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
return true;
}
-/// CoerceAvailableValueToLoadType - If we saw a store of a value to memory, and
+/// If we saw a store of a value to memory, and
/// then a load from a must-aliased pointer of a different type, try to coerce
/// the stored value. LoadedTy is the type of the load we want to replace and
/// InsertPt is the place to insert new instructions.
@@ -936,7 +938,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal,
return new BitCastInst(StoredVal, LoadedTy, "bitcast", InsertPt);
}
-/// AnalyzeLoadFromClobberingWrite - This function is called when we have a
+/// This function is called when we have a
/// memdep query of a load that ends up being a clobbering memory write (store,
/// memset, memcpy, memmove). This means that the write *may* provide bits used
/// by the load but we can't be sure because the pointers don't mustalias.
@@ -1016,7 +1018,7 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
return LoadOffset-StoreOffset;
}
-/// AnalyzeLoadFromClobberingStore - This function is called when we have a
+/// This function is called when we have a
/// memdep query of a load that ends up being a clobbering store.
static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
StoreInst *DepSI,
@@ -1032,7 +1034,7 @@ static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
StorePtr, StoreSize, DL);
}
-/// AnalyzeLoadFromClobberingLoad - This function is called when we have a
+/// This function is called when we have a
/// memdep query of a load that ends up being clobbered by another load. See if
/// the other load can feed into the second load.
static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr,
@@ -1108,7 +1110,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
}
-/// GetStoreValueForLoad - This function is called when we have a
+/// This function is called when we have a
/// memdep query of a load that ends up being a clobbering store. This means
/// that the store provides bits used by the load but we the pointers don't
/// mustalias. Check this case to see if there is anything more we can do
@@ -1147,7 +1149,7 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, DL);
}
-/// GetLoadValueForLoad - This function is called when we have a
+/// This function is called when we have a
/// memdep query of a load that ends up being a clobbering load. This means
/// that the load *may* provide bits used by the load but we can't be sure
/// because the pointers don't mustalias. Check this case to see if there is
@@ -1210,7 +1212,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
}
-/// GetMemInstValueForLoad - This function is called when we have a
+/// This function is called when we have a
/// memdep query of a load that ends up being a clobbering mem intrinsic.
static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
Type *LoadTy, Instruction *InsertPt,
@@ -1267,7 +1269,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
}
-/// ConstructSSAForLoadSet - Given a set of loads specified by ValuesPerBlock,
+/// Given a set of loads specified by ValuesPerBlock,
/// construct SSA form, allowing us to eliminate LI. This returns the value
/// that should be used at LI's definition site.
static Value *ConstructSSAForLoadSet(LoadInst *LI,
@@ -1621,7 +1623,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
// If all preds have a single successor, then we know it is safe to insert
// the load on the pred (?!?), so we can insert code to materialize the
// pointer if it is not available.
- PHITransAddr Address(LI->getPointerOperand(), DL, AT);
+ PHITransAddr Address(LI->getPointerOperand(), DL, AC);
Value *LoadPtr = nullptr;
LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred,
*DT, NewInsts);
@@ -1702,13 +1704,12 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
return true;
}
-/// processNonLocalLoad - Attempt to eliminate a load whose dependencies are
+/// Attempt to eliminate a load whose dependencies are
/// non-local by performing PHI construction.
bool GVN::processNonLocalLoad(LoadInst *LI) {
// Step 1: Find the non-local dependencies of the load.
LoadDepVect Deps;
- AliasAnalysis::Location Loc = VN.getAliasAnalysis()->getLocation(LI);
- MD->getNonLocalPointerDependency(Loc, true, LI->getParent(), Deps);
+ MD->getNonLocalPointerDependency(LI, Deps);
// If we had to process more than one hundred blocks to find the
// dependencies, this load isn't worth worrying about. Optimizing
@@ -1729,6 +1730,15 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
return false;
}
+ // If this load follows a GEP, see if we can PRE the indices before analyzing.
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0))) {
+ for (GetElementPtrInst::op_iterator OI = GEP->idx_begin(),
+ OE = GEP->idx_end();
+ OI != OE; ++OI)
+ if (Instruction *I = dyn_cast<Instruction>(OI->get()))
+ performScalarPRE(I);
+ }
+
// Step 2: Analyze the availability of the load
AvailValInBlkVect ValuesPerBlock;
UnavailBlkVect UnavailableBlocks;
@@ -1807,7 +1817,7 @@ static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
I->replaceAllUsesWith(Repl);
}
-/// processLoad - Attempt to eliminate a load, first by eliminating it
+/// Attempt to eliminate a load, first by eliminating it
/// locally, and then attempting non-local elimination if that fails.
bool GVN::processLoad(LoadInst *L) {
if (!MD)
@@ -2006,7 +2016,7 @@ bool GVN::processLoad(LoadInst *L) {
return false;
}
-// findLeader - In order to find a leader for a given value number at a
+// In order to find a leader for a given value number at a
// specific basic block, we first obtain the list of all Values for that number,
// and then scan the list to find one whose block dominates the block in
// question. This is fast because dominator tree queries consist of only
@@ -2034,9 +2044,8 @@ Value *GVN::findLeader(const BasicBlock *BB, uint32_t num) {
return Val;
}
-/// replaceAllDominatedUsesWith - Replace all uses of 'From' with 'To' if the
-/// use is dominated by the given basic block. Returns the number of uses that
-/// were replaced.
+/// Replace all uses of 'From' with 'To' if the use is dominated by the given
+/// basic block. Returns the number of uses that were replaced.
unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To,
const BasicBlockEdge &Root) {
unsigned Count = 0;
@@ -2052,7 +2061,7 @@ unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To,
return Count;
}
-/// isOnlyReachableViaThisEdge - There is an edge from 'Src' to 'Dst'. Return
+/// There is an edge from 'Src' to 'Dst'. Return
/// true if every path from the entry block to 'Dst' passes via this edge. In
/// particular 'Dst' must not be reachable via another edge from 'Src'.
static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E,
@@ -2069,7 +2078,7 @@ static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E,
return Pred != nullptr;
}
-/// propagateEquality - The given values are known to be equal in every block
+/// The given values are known to be equal in every block
/// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with
/// 'RHS' everywhere in the scope. Returns whether a change was made.
bool GVN::propagateEquality(Value *LHS, Value *RHS,
@@ -2096,15 +2105,15 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS,
std::swap(LHS, RHS);
assert((isa<Argument>(LHS) || isa<Instruction>(LHS)) && "Unexpected value!");
- // If there is no obvious reason to prefer the left-hand side over the right-
- // hand side, ensure the longest lived term is on the right-hand side, so the
- // shortest lived term will be replaced by the longest lived. This tends to
- // expose more simplifications.
+ // If there is no obvious reason to prefer the left-hand side over the
+ // right-hand side, ensure the longest lived term is on the right-hand side,
+ // so the shortest lived term will be replaced by the longest lived.
+ // This tends to expose more simplifications.
uint32_t LVN = VN.lookup_or_add(LHS);
if ((isa<Argument>(LHS) && isa<Argument>(RHS)) ||
(isa<Instruction>(LHS) && isa<Instruction>(RHS))) {
- // Move the 'oldest' value to the right-hand side, using the value number as
- // a proxy for age.
+ // Move the 'oldest' value to the right-hand side, using the value number
+ // as a proxy for age.
uint32_t RVN = VN.lookup_or_add(RHS);
if (LVN < RVN) {
std::swap(LHS, RHS);
@@ -2133,10 +2142,10 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS,
NumGVNEqProp += NumReplacements;
}
- // Now try to deduce additional equalities from this one. For example, if the
- // known equality was "(A != B)" == "false" then it follows that A and B are
- // equal in the scope. Only boolean equalities with an explicit true or false
- // RHS are currently supported.
+ // Now try to deduce additional equalities from this one. For example, if
+ // the known equality was "(A != B)" == "false" then it follows that A and B
+ // are equal in the scope. Only boolean equalities with an explicit true or
+ // false RHS are currently supported.
if (!RHS->getType()->isIntegerTy(1))
// Not a boolean equality - bail out.
continue;
@@ -2161,7 +2170,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS,
// If we are propagating an equality like "(A == B)" == "true" then also
// propagate the equality A == B. When propagating a comparison such as
// "(A >= B)" == "true", replace all instances of "A < B" with "false".
- if (ICmpInst *Cmp = dyn_cast<ICmpInst>(LHS)) {
+ if (CmpInst *Cmp = dyn_cast<CmpInst>(LHS)) {
Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1);
// If "A == B" is known true, or "A != B" is known false, then replace
@@ -2170,12 +2179,28 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS,
(isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE))
Worklist.push_back(std::make_pair(Op0, Op1));
+ // Handle the floating point versions of equality comparisons too.
+ if ((isKnownTrue && Cmp->getPredicate() == CmpInst::FCMP_OEQ) ||
+ (isKnownFalse && Cmp->getPredicate() == CmpInst::FCMP_UNE)) {
+
+ // Floating point -0.0 and 0.0 compare equal, so we can only
+ // propagate values if we know that we have a constant and that
+ // its value is non-zero.
+
+ // FIXME: We should do this optimization if 'no signed zeros' is
+ // applicable via an instruction-level fast-math-flag or some other
+ // indicator that relaxed FP semantics are being used.
+
+ if (isa<ConstantFP>(Op1) && !cast<ConstantFP>(Op1)->isZero())
+ Worklist.push_back(std::make_pair(Op0, Op1));
+ }
+
// If "A >= B" is known true, replace "A < B" with false everywhere.
CmpInst::Predicate NotPred = Cmp->getInversePredicate();
Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse);
- // Since we don't have the instruction "A < B" immediately to hand, work out
- // the value number that it would have and use that to find an appropriate
- // instruction (if any).
+ // Since we don't have the instruction "A < B" immediately to hand, work
+ // out the value number that it would have and use that to find an
+ // appropriate instruction (if any).
uint32_t NextNum = VN.getNextUnusedValueNumber();
uint32_t Num = VN.lookup_or_add_cmp(Cmp->getOpcode(), NotPred, Op0, Op1);
// If the number we were assigned was brand new then there is no point in
@@ -2203,7 +2228,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS,
return Changed;
}
-/// processInstruction - When calculating availability, handle an instruction
+/// When calculating availability, handle an instruction
/// by inserting it into the appropriate sets
bool GVN::processInstruction(Instruction *I) {
// Ignore dbg info intrinsics.
@@ -2214,7 +2239,7 @@ bool GVN::processInstruction(Instruction *I) {
// to value numbering it. Value numbering often exposes redundancies, for
// example if it determines that %y is equal to %x then the instruction
// "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify.
- if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AT)) {
+ if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) {
I->replaceAllUsesWith(V);
if (MD && V->getType()->getScalarType()->isPointerTy())
MD->invalidateCachedPointerInfo(V);
@@ -2334,8 +2359,8 @@ bool GVN::runOnFunction(Function& F) {
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
DL = DLP ? &DLP->getDataLayout() : nullptr;
- AT = &getAnalysis<AssumptionTracker>();
- TLI = &getAnalysis<TargetLibraryInfo>();
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>());
VN.setMemDep(MD);
VN.setDomTree(DT);
@@ -2348,7 +2373,8 @@ bool GVN::runOnFunction(Function& F) {
for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) {
BasicBlock *BB = FI++;
- bool removedBlock = MergeBlockIntoPredecessor(BB, this);
+ bool removedBlock = MergeBlockIntoPredecessor(
+ BB, DT, /* LoopInfo */ nullptr, VN.getAliasAnalysis(), MD);
if (removedBlock) ++NumGVNBlocks;
Changed |= removedBlock;
@@ -2431,175 +2457,204 @@ bool GVN::processBlock(BasicBlock *BB) {
return ChangedFunction;
}
-/// performPRE - Perform a purely local form of PRE that looks for diamond
-/// control flow patterns and attempts to perform simple PRE at the join point.
-bool GVN::performPRE(Function &F) {
- bool Changed = false;
+// Instantiate an expression in a predecessor that lacked it.
+bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
+ unsigned int ValNo) {
+ // Because we are going top-down through the block, all value numbers
+ // will be available in the predecessor by the time we need them. Any
+ // that weren't originally present will have been instantiated earlier
+ // in this loop.
+ bool success = true;
+ for (unsigned i = 0, e = Instr->getNumOperands(); i != e; ++i) {
+ Value *Op = Instr->getOperand(i);
+ if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op))
+ continue;
+
+ if (Value *V = findLeader(Pred, VN.lookup(Op))) {
+ Instr->setOperand(i, V);
+ } else {
+ success = false;
+ break;
+ }
+ }
+
+ // Fail out if we encounter an operand that is not available in
+ // the PRE predecessor. This is typically because of loads which
+ // are not value numbered precisely.
+ if (!success)
+ return false;
+
+ Instr->insertBefore(Pred->getTerminator());
+ Instr->setName(Instr->getName() + ".pre");
+ Instr->setDebugLoc(Instr->getDebugLoc());
+ VN.add(Instr, ValNo);
+
+ // Update the availability map to include the new instruction.
+ addToLeaderTable(ValNo, Instr, Pred);
+ return true;
+}
+
+bool GVN::performScalarPRE(Instruction *CurInst) {
SmallVector<std::pair<Value*, BasicBlock*>, 8> predMap;
- for (BasicBlock *CurrentBlock : depth_first(&F.getEntryBlock())) {
- // Nothing to PRE in the entry block.
- if (CurrentBlock == &F.getEntryBlock()) continue;
- // Don't perform PRE on a landing pad.
- if (CurrentBlock->isLandingPad()) continue;
+ if (isa<AllocaInst>(CurInst) || isa<TerminatorInst>(CurInst) ||
+ isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() ||
+ CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() ||
+ isa<DbgInfoIntrinsic>(CurInst))
+ return false;
- for (BasicBlock::iterator BI = CurrentBlock->begin(),
- BE = CurrentBlock->end(); BI != BE; ) {
- Instruction *CurInst = BI++;
+ // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from
+ // sinking the compare again, and it would force the code generator to
+ // move the i1 from processor flags or predicate registers into a general
+ // purpose register.
+ if (isa<CmpInst>(CurInst))
+ return false;
- if (isa<AllocaInst>(CurInst) ||
- isa<TerminatorInst>(CurInst) || isa<PHINode>(CurInst) ||
- CurInst->getType()->isVoidTy() ||
- CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() ||
- isa<DbgInfoIntrinsic>(CurInst))
- continue;
+ // We don't currently value number ANY inline asm calls.
+ if (CallInst *CallI = dyn_cast<CallInst>(CurInst))
+ if (CallI->isInlineAsm())
+ return false;
- // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from
- // sinking the compare again, and it would force the code generator to
- // move the i1 from processor flags or predicate registers into a general
- // purpose register.
- if (isa<CmpInst>(CurInst))
- continue;
+ uint32_t ValNo = VN.lookup(CurInst);
+
+ // Look for the predecessors for PRE opportunities. We're
+ // only trying to solve the basic diamond case, where
+ // a value is computed in the successor and one predecessor,
+ // but not the other. We also explicitly disallow cases
+ // where the successor is its own predecessor, because they're
+ // more complicated to get right.
+ unsigned NumWith = 0;
+ unsigned NumWithout = 0;
+ BasicBlock *PREPred = nullptr;
+ BasicBlock *CurrentBlock = CurInst->getParent();
+ predMap.clear();
+
+ for (pred_iterator PI = pred_begin(CurrentBlock), PE = pred_end(CurrentBlock);
+ PI != PE; ++PI) {
+ BasicBlock *P = *PI;
+ // We're not interested in PRE where the block is its
+ // own predecessor, or in blocks with predecessors
+ // that are not reachable.
+ if (P == CurrentBlock) {
+ NumWithout = 2;
+ break;
+ } else if (!DT->isReachableFromEntry(P)) {
+ NumWithout = 2;
+ break;
+ }
- // We don't currently value number ANY inline asm calls.
- if (CallInst *CallI = dyn_cast<CallInst>(CurInst))
- if (CallI->isInlineAsm())
- continue;
+ Value *predV = findLeader(P, ValNo);
+ if (!predV) {
+ predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P));
+ PREPred = P;
+ ++NumWithout;
+ } else if (predV == CurInst) {
+ /* CurInst dominates this predecessor. */
+ NumWithout = 2;
+ break;
+ } else {
+ predMap.push_back(std::make_pair(predV, P));
+ ++NumWith;
+ }
+ }
- uint32_t ValNo = VN.lookup(CurInst);
-
- // Look for the predecessors for PRE opportunities. We're
- // only trying to solve the basic diamond case, where
- // a value is computed in the successor and one predecessor,
- // but not the other. We also explicitly disallow cases
- // where the successor is its own predecessor, because they're
- // more complicated to get right.
- unsigned NumWith = 0;
- unsigned NumWithout = 0;
- BasicBlock *PREPred = nullptr;
- predMap.clear();
-
- for (pred_iterator PI = pred_begin(CurrentBlock),
- PE = pred_end(CurrentBlock); PI != PE; ++PI) {
- BasicBlock *P = *PI;
- // We're not interested in PRE where the block is its
- // own predecessor, or in blocks with predecessors
- // that are not reachable.
- if (P == CurrentBlock) {
- NumWithout = 2;
- break;
- } else if (!DT->isReachableFromEntry(P)) {
- NumWithout = 2;
- break;
- }
+ // Don't do PRE when it might increase code size, i.e. when
+ // we would need to insert instructions in more than one pred.
+ if (NumWithout > 1 || NumWith == 0)
+ return false;
- Value* predV = findLeader(P, ValNo);
- if (!predV) {
- predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P));
- PREPred = P;
- ++NumWithout;
- } else if (predV == CurInst) {
- /* CurInst dominates this predecessor. */
- NumWithout = 2;
- break;
- } else {
- predMap.push_back(std::make_pair(predV, P));
- ++NumWith;
- }
- }
+ // We may have a case where all predecessors have the instruction,
+ // and we just need to insert a phi node. Otherwise, perform
+ // insertion.
+ Instruction *PREInstr = nullptr;
- // Don't do PRE when it might increase code size, i.e. when
- // we would need to insert instructions in more than one pred.
- if (NumWithout != 1 || NumWith == 0)
- continue;
+ if (NumWithout != 0) {
+ // Don't do PRE across indirect branch.
+ if (isa<IndirectBrInst>(PREPred->getTerminator()))
+ return false;
- // Don't do PRE across indirect branch.
- if (isa<IndirectBrInst>(PREPred->getTerminator()))
- continue;
+ // We can't do PRE safely on a critical edge, so instead we schedule
+ // the edge to be split and perform the PRE the next time we iterate
+ // on the function.
+ unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock);
+ if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) {
+ toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum));
+ return false;
+ }
+ // We need to insert somewhere, so let's give it a shot
+ PREInstr = CurInst->clone();
+ if (!performScalarPREInsertion(PREInstr, PREPred, ValNo)) {
+ // If we failed insertion, make sure we remove the instruction.
+ DEBUG(verifyRemoved(PREInstr));
+ delete PREInstr;
+ return false;
+ }
+ }
- // We can't do PRE safely on a critical edge, so instead we schedule
- // the edge to be split and perform the PRE the next time we iterate
- // on the function.
- unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock);
- if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) {
- toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum));
- continue;
- }
+ // Either we should have filled in the PRE instruction, or we should
+ // not have needed insertions.
+ assert (PREInstr != nullptr || NumWithout == 0);
- // Instantiate the expression in the predecessor that lacked it.
- // Because we are going top-down through the block, all value numbers
- // will be available in the predecessor by the time we need them. Any
- // that weren't originally present will have been instantiated earlier
- // in this loop.
- Instruction *PREInstr = CurInst->clone();
- bool success = true;
- for (unsigned i = 0, e = CurInst->getNumOperands(); i != e; ++i) {
- Value *Op = PREInstr->getOperand(i);
- if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op))
- continue;
+ ++NumGVNPRE;
- if (Value *V = findLeader(PREPred, VN.lookup(Op))) {
- PREInstr->setOperand(i, V);
- } else {
- success = false;
- break;
- }
- }
+ // Create a PHI to make the value available in this block.
+ PHINode *Phi =
+ PHINode::Create(CurInst->getType(), predMap.size(),
+ CurInst->getName() + ".pre-phi", CurrentBlock->begin());
+ for (unsigned i = 0, e = predMap.size(); i != e; ++i) {
+ if (Value *V = predMap[i].first)
+ Phi->addIncoming(V, predMap[i].second);
+ else
+ Phi->addIncoming(PREInstr, PREPred);
+ }
+
+ VN.add(Phi, ValNo);
+ addToLeaderTable(ValNo, Phi, CurrentBlock);
+ Phi->setDebugLoc(CurInst->getDebugLoc());
+ CurInst->replaceAllUsesWith(Phi);
+ if (Phi->getType()->getScalarType()->isPointerTy()) {
+ // Because we have added a PHI-use of the pointer value, it has now
+ // "escaped" from alias analysis' perspective. We need to inform
+ // AA of this.
+ for (unsigned ii = 0, ee = Phi->getNumIncomingValues(); ii != ee; ++ii) {
+ unsigned jj = PHINode::getOperandNumForIncomingValue(ii);
+ VN.getAliasAnalysis()->addEscapingUse(Phi->getOperandUse(jj));
+ }
- // Fail out if we encounter an operand that is not available in
- // the PRE predecessor. This is typically because of loads which
- // are not value numbered precisely.
- if (!success) {
- DEBUG(verifyRemoved(PREInstr));
- delete PREInstr;
- continue;
- }
+ if (MD)
+ MD->invalidateCachedPointerInfo(Phi);
+ }
+ VN.erase(CurInst);
+ removeFromLeaderTable(ValNo, CurInst, CurrentBlock);
- PREInstr->insertBefore(PREPred->getTerminator());
- PREInstr->setName(CurInst->getName() + ".pre");
- PREInstr->setDebugLoc(CurInst->getDebugLoc());
- VN.add(PREInstr, ValNo);
- ++NumGVNPRE;
-
- // Update the availability map to include the new instruction.
- addToLeaderTable(ValNo, PREInstr, PREPred);
-
- // Create a PHI to make the value available in this block.
- PHINode* Phi = PHINode::Create(CurInst->getType(), predMap.size(),
- CurInst->getName() + ".pre-phi",
- CurrentBlock->begin());
- for (unsigned i = 0, e = predMap.size(); i != e; ++i) {
- if (Value *V = predMap[i].first)
- Phi->addIncoming(V, predMap[i].second);
- else
- Phi->addIncoming(PREInstr, PREPred);
- }
+ DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
+ if (MD)
+ MD->removeInstruction(CurInst);
+ DEBUG(verifyRemoved(CurInst));
+ CurInst->eraseFromParent();
+ ++NumGVNInstr;
+
+ return true;
+}
- VN.add(Phi, ValNo);
- addToLeaderTable(ValNo, Phi, CurrentBlock);
- Phi->setDebugLoc(CurInst->getDebugLoc());
- CurInst->replaceAllUsesWith(Phi);
- if (Phi->getType()->getScalarType()->isPointerTy()) {
- // Because we have added a PHI-use of the pointer value, it has now
- // "escaped" from alias analysis' perspective. We need to inform
- // AA of this.
- for (unsigned ii = 0, ee = Phi->getNumIncomingValues(); ii != ee;
- ++ii) {
- unsigned jj = PHINode::getOperandNumForIncomingValue(ii);
- VN.getAliasAnalysis()->addEscapingUse(Phi->getOperandUse(jj));
- }
+/// Perform a purely local form of PRE that looks for diamond
+/// control flow patterns and attempts to perform simple PRE at the join point.
+bool GVN::performPRE(Function &F) {
+ bool Changed = false;
+ for (BasicBlock *CurrentBlock : depth_first(&F.getEntryBlock())) {
+ // Nothing to PRE in the entry block.
+ if (CurrentBlock == &F.getEntryBlock())
+ continue;
- if (MD)
- MD->invalidateCachedPointerInfo(Phi);
- }
- VN.erase(CurInst);
- removeFromLeaderTable(ValNo, CurInst, CurrentBlock);
+ // Don't perform PRE on a landing pad.
+ if (CurrentBlock->isLandingPad())
+ continue;
- DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
- if (MD) MD->removeInstruction(CurInst);
- DEBUG(verifyRemoved(CurInst));
- CurInst->eraseFromParent();
- Changed = true;
+ for (BasicBlock::iterator BI = CurrentBlock->begin(),
+ BE = CurrentBlock->end();
+ BI != BE;) {
+ Instruction *CurInst = BI++;
+ Changed = performScalarPRE(CurInst);
}
}
@@ -2612,50 +2667,48 @@ bool GVN::performPRE(Function &F) {
/// Split the critical edge connecting the given two blocks, and return
/// the block inserted to the critical edge.
BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) {
- BasicBlock *BB = SplitCriticalEdge(Pred, Succ, this);
+ BasicBlock *BB = SplitCriticalEdge(
+ Pred, Succ, CriticalEdgeSplittingOptions(getAliasAnalysis(), DT));
if (MD)
MD->invalidateCachedPredecessors();
return BB;
}
-/// splitCriticalEdges - Split critical edges found during the previous
+/// Split critical edges found during the previous
/// iteration that may enable further optimization.
bool GVN::splitCriticalEdges() {
if (toSplit.empty())
return false;
do {
std::pair<TerminatorInst*, unsigned> Edge = toSplit.pop_back_val();
- SplitCriticalEdge(Edge.first, Edge.second, this);
+ SplitCriticalEdge(Edge.first, Edge.second,
+ CriticalEdgeSplittingOptions(getAliasAnalysis(), DT));
} while (!toSplit.empty());
if (MD) MD->invalidateCachedPredecessors();
return true;
}
-/// iterateOnFunction - Executes one iteration of GVN
+/// Executes one iteration of GVN
bool GVN::iterateOnFunction(Function &F) {
cleanupGlobalSets();
// Top-down walk of the dominator tree
bool Changed = false;
-#if 0
- // Needed for value numbering with phi construction to work.
- ReversePostOrderTraversal<Function*> RPOT(&F);
- for (ReversePostOrderTraversal<Function*>::rpo_iterator RI = RPOT.begin(),
- RE = RPOT.end(); RI != RE; ++RI)
- Changed |= processBlock(*RI);
-#else
// Save the blocks this function have before transformation begins. GVN may
// split critical edge, and hence may invalidate the RPO/DT iterator.
//
std::vector<BasicBlock *> BBVect;
BBVect.reserve(256);
- for (DomTreeNode *X : depth_first(DT->getRootNode()))
- BBVect.push_back(X->getBlock());
+ // Needed for value numbering with phi construction to work.
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+ for (ReversePostOrderTraversal<Function *>::rpo_iterator RI = RPOT.begin(),
+ RE = RPOT.end();
+ RI != RE; ++RI)
+ BBVect.push_back(*RI);
for (std::vector<BasicBlock *>::iterator I = BBVect.begin(), E = BBVect.end();
I != E; I++)
Changed |= processBlock(*I);
-#endif
return Changed;
}
@@ -2666,7 +2719,7 @@ void GVN::cleanupGlobalSets() {
TableAllocator.Reset();
}
-/// verifyRemoved - Verify that the specified instruction does not occur in our
+/// Verify that the specified instruction does not occur in our
/// internal data structures.
void GVN::verifyRemoved(const Instruction *Inst) const {
VN.verifyRemoved(Inst);
@@ -2685,11 +2738,10 @@ void GVN::verifyRemoved(const Instruction *Inst) const {
}
}
-// BB is declared dead, which implied other blocks become dead as well. This
-// function is to add all these blocks to "DeadBlocks". For the dead blocks'
-// live successors, update their phi nodes by replacing the operands
-// corresponding to dead blocks with UndefVal.
-//
+/// BB is declared dead, which implied other blocks become dead as well. This
+/// function is to add all these blocks to "DeadBlocks". For the dead blocks'
+/// live successors, update their phi nodes by replacing the operands
+/// corresponding to dead blocks with UndefVal.
void GVN::addDeadBlock(BasicBlock *BB) {
SmallVector<BasicBlock *, 4> NewDead;
SmallSetVector<BasicBlock *, 4> DF;
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index c01f57f..f99ebbc 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -44,7 +44,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
@@ -91,7 +91,7 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfo>();
+ AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<ScalarEvolution>();
AU.addRequiredID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
@@ -126,7 +126,7 @@ char IndVarSimplify::ID = 0;
INITIALIZE_PASS_BEGIN(IndVarSimplify, "indvars",
"Induction Variable Simplification", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LCSSA)
@@ -1929,13 +1929,15 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
if (!L->isLoopSimplifyForm())
return false;
- LI = &getAnalysis<LoopInfo>();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
SE = &getAnalysis<ScalarEvolution>();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
DL = DLP ? &DLP->getDataLayout() : nullptr;
- TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
- TTI = getAnalysisIfAvailable<TargetTransformInfo>();
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ TLI = TLIP ? &TLIP->getTLI() : nullptr;
+ auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
+ TTI = TTIP ? &TTIP->getTTI(*L->getHeader()->getParent()) : nullptr;
DeadInsts.clear();
Changed = false;
diff --git a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
new file mode 100644
index 0000000..8559e63
--- /dev/null
+++ b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -0,0 +1,1422 @@
+//===-- InductiveRangeCheckElimination.cpp - ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// The InductiveRangeCheckElimination pass splits a loop's iteration space into
+// three disjoint ranges. It does that in a way such that the loop running in
+// the middle loop provably does not need range checks. As an example, it will
+// convert
+//
+// len = < known positive >
+// for (i = 0; i < n; i++) {
+// if (0 <= i && i < len) {
+// do_something();
+// } else {
+// throw_out_of_bounds();
+// }
+// }
+//
+// to
+//
+// len = < known positive >
+// limit = smin(n, len)
+// // no first segment
+// for (i = 0; i < limit; i++) {
+// if (0 <= i && i < len) { // this check is fully redundant
+// do_something();
+// } else {
+// throw_out_of_bounds();
+// }
+// }
+// for (i = limit; i < n; i++) {
+// if (0 <= i && i < len) {
+// do_something();
+// } else {
+// throw_out_of_bounds();
+// }
+// }
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Optional.h"
+
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/Verifier.h"
+
+#include "llvm/Support/Debug.h"
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+
+#include "llvm/Pass.h"
+
+#include <array>
+
+using namespace llvm;
+
+static cl::opt<unsigned> LoopSizeCutoff("irce-loop-size-cutoff", cl::Hidden,
+ cl::init(64));
+
+static cl::opt<bool> PrintChangedLoops("irce-print-changed-loops", cl::Hidden,
+ cl::init(false));
+
+static cl::opt<int> MaxExitProbReciprocal("irce-max-exit-prob-reciprocal",
+ cl::Hidden, cl::init(10));
+
+#define DEBUG_TYPE "irce"
+
+namespace {
+
+/// An inductive range check is conditional branch in a loop with
+///
+/// 1. a very cold successor (i.e. the branch jumps to that successor very
+/// rarely)
+///
+/// and
+///
+/// 2. a condition that is provably true for some range of values taken by the
+/// containing loop's induction variable.
+///
+/// Currently all inductive range checks are branches conditional on an
+/// expression of the form
+///
+/// 0 <= (Offset + Scale * I) < Length
+///
+/// where `I' is the canonical induction variable of a loop to which Offset and
+/// Scale are loop invariant, and Length is >= 0. Currently the 'false' branch
+/// is considered cold, looking at profiling data to verify that is a TODO.
+
+class InductiveRangeCheck {
+ const SCEV *Offset;
+ const SCEV *Scale;
+ Value *Length;
+ BranchInst *Branch;
+
+ InductiveRangeCheck() :
+ Offset(nullptr), Scale(nullptr), Length(nullptr), Branch(nullptr) { }
+
+public:
+ const SCEV *getOffset() const { return Offset; }
+ const SCEV *getScale() const { return Scale; }
+ Value *getLength() const { return Length; }
+
+ void print(raw_ostream &OS) const {
+ OS << "InductiveRangeCheck:\n";
+ OS << " Offset: ";
+ Offset->print(OS);
+ OS << " Scale: ";
+ Scale->print(OS);
+ OS << " Length: ";
+ Length->print(OS);
+ OS << " Branch: ";
+ getBranch()->print(OS);
+ OS << "\n";
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dump() {
+ print(dbgs());
+ }
+#endif
+
+ BranchInst *getBranch() const { return Branch; }
+
+ /// Represents an signed integer range [Range.getBegin(), Range.getEnd()). If
+ /// R.getEnd() sle R.getBegin(), then R denotes the empty range.
+
+ class Range {
+ const SCEV *Begin;
+ const SCEV *End;
+
+ public:
+ Range(const SCEV *Begin, const SCEV *End) : Begin(Begin), End(End) {
+ assert(Begin->getType() == End->getType() && "ill-typed range!");
+ }
+
+ Type *getType() const { return Begin->getType(); }
+ const SCEV *getBegin() const { return Begin; }
+ const SCEV *getEnd() const { return End; }
+ };
+
+ typedef SpecificBumpPtrAllocator<InductiveRangeCheck> AllocatorTy;
+
+ /// This is the value the condition of the branch needs to evaluate to for the
+ /// branch to take the hot successor (see (1) above).
+ bool getPassingDirection() { return true; }
+
+ /// Computes a range for the induction variable (IndVar) in which the range
+ /// check is redundant and can be constant-folded away. The induction
+ /// variable is not required to be the canonical {0,+,1} induction variable.
+ Optional<Range> computeSafeIterationSpace(ScalarEvolution &SE,
+ const SCEVAddRecExpr *IndVar,
+ IRBuilder<> &B) const;
+
+ /// Create an inductive range check out of BI if possible, else return
+ /// nullptr.
+ static InductiveRangeCheck *create(AllocatorTy &Alloc, BranchInst *BI,
+ Loop *L, ScalarEvolution &SE,
+ BranchProbabilityInfo &BPI);
+};
+
+class InductiveRangeCheckElimination : public LoopPass {
+ InductiveRangeCheck::AllocatorTy Allocator;
+
+public:
+ static char ID;
+ InductiveRangeCheckElimination() : LoopPass(ID) {
+ initializeInductiveRangeCheckEliminationPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequiredID(LCSSAID);
+ AU.addRequired<ScalarEvolution>();
+ AU.addRequired<BranchProbabilityInfo>();
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+};
+
+char InductiveRangeCheckElimination::ID = 0;
+}
+
+INITIALIZE_PASS(InductiveRangeCheckElimination, "irce",
+ "Inductive range check elimination", false, false)
+
+static bool IsLowerBoundCheck(Value *Check, Value *&IndexV) {
+ using namespace llvm::PatternMatch;
+
+ ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
+ Value *LHS = nullptr, *RHS = nullptr;
+
+ if (!match(Check, m_ICmp(Pred, m_Value(LHS), m_Value(RHS))))
+ return false;
+
+ switch (Pred) {
+ default:
+ return false;
+
+ case ICmpInst::ICMP_SLE:
+ std::swap(LHS, RHS);
+ // fallthrough
+ case ICmpInst::ICMP_SGE:
+ if (!match(RHS, m_ConstantInt<0>()))
+ return false;
+ IndexV = LHS;
+ return true;
+
+ case ICmpInst::ICMP_SLT:
+ std::swap(LHS, RHS);
+ // fallthrough
+ case ICmpInst::ICMP_SGT:
+ if (!match(RHS, m_ConstantInt<-1>()))
+ return false;
+ IndexV = LHS;
+ return true;
+ }
+}
+
+static bool IsUpperBoundCheck(Value *Check, Value *Index, Value *&UpperLimit) {
+ using namespace llvm::PatternMatch;
+
+ ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
+ Value *LHS = nullptr, *RHS = nullptr;
+
+ if (!match(Check, m_ICmp(Pred, m_Value(LHS), m_Value(RHS))))
+ return false;
+
+ switch (Pred) {
+ default:
+ return false;
+
+ case ICmpInst::ICMP_SGT:
+ std::swap(LHS, RHS);
+ // fallthrough
+ case ICmpInst::ICMP_SLT:
+ if (LHS != Index)
+ return false;
+ UpperLimit = RHS;
+ return true;
+
+ case ICmpInst::ICMP_UGT:
+ std::swap(LHS, RHS);
+ // fallthrough
+ case ICmpInst::ICMP_ULT:
+ if (LHS != Index)
+ return false;
+ UpperLimit = RHS;
+ return true;
+ }
+}
+
+/// Split a condition into something semantically equivalent to (0 <= I <
+/// Limit), both comparisons signed and Len loop invariant on L and positive.
+/// On success, return true and set Index to I and UpperLimit to Limit. Return
+/// false on failure (we may still write to UpperLimit and Index on failure).
+/// It does not try to interpret I as a loop index.
+///
+static bool SplitRangeCheckCondition(Loop *L, ScalarEvolution &SE,
+ Value *Condition, const SCEV *&Index,
+ Value *&UpperLimit) {
+
+ // TODO: currently this catches some silly cases like comparing "%idx slt 1".
+ // Our transformations are still correct, but less likely to be profitable in
+ // those cases. We have to come up with some heuristics that pick out the
+ // range checks that are more profitable to clone a loop for. This function
+ // in general can be made more robust.
+
+ using namespace llvm::PatternMatch;
+
+ Value *A = nullptr;
+ Value *B = nullptr;
+ ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
+
+ // In these early checks we assume that the matched UpperLimit is positive.
+ // We'll verify that fact later, before returning true.
+
+ if (match(Condition, m_And(m_Value(A), m_Value(B)))) {
+ Value *IndexV = nullptr;
+ Value *ExpectedUpperBoundCheck = nullptr;
+
+ if (IsLowerBoundCheck(A, IndexV))
+ ExpectedUpperBoundCheck = B;
+ else if (IsLowerBoundCheck(B, IndexV))
+ ExpectedUpperBoundCheck = A;
+ else
+ return false;
+
+ if (!IsUpperBoundCheck(ExpectedUpperBoundCheck, IndexV, UpperLimit))
+ return false;
+
+ Index = SE.getSCEV(IndexV);
+
+ if (isa<SCEVCouldNotCompute>(Index))
+ return false;
+
+ } else if (match(Condition, m_ICmp(Pred, m_Value(A), m_Value(B)))) {
+ switch (Pred) {
+ default:
+ return false;
+
+ case ICmpInst::ICMP_SGT:
+ std::swap(A, B);
+ // fall through
+ case ICmpInst::ICMP_SLT:
+ UpperLimit = B;
+ Index = SE.getSCEV(A);
+ if (isa<SCEVCouldNotCompute>(Index) || !SE.isKnownNonNegative(Index))
+ return false;
+ break;
+
+ case ICmpInst::ICMP_UGT:
+ std::swap(A, B);
+ // fall through
+ case ICmpInst::ICMP_ULT:
+ UpperLimit = B;
+ Index = SE.getSCEV(A);
+ if (isa<SCEVCouldNotCompute>(Index))
+ return false;
+ break;
+ }
+ } else {
+ return false;
+ }
+
+ const SCEV *UpperLimitSCEV = SE.getSCEV(UpperLimit);
+ if (isa<SCEVCouldNotCompute>(UpperLimitSCEV) ||
+ !SE.isKnownNonNegative(UpperLimitSCEV))
+ return false;
+
+ if (SE.getLoopDisposition(UpperLimitSCEV, L) !=
+ ScalarEvolution::LoopInvariant) {
+ DEBUG(dbgs() << " in function: " << L->getHeader()->getParent()->getName()
+ << " ";
+ dbgs() << " UpperLimit is not loop invariant: "
+ << UpperLimit->getName() << "\n";);
+ return false;
+ }
+
+ return true;
+}
+
+
+InductiveRangeCheck *
+InductiveRangeCheck::create(InductiveRangeCheck::AllocatorTy &A, BranchInst *BI,
+ Loop *L, ScalarEvolution &SE,
+ BranchProbabilityInfo &BPI) {
+
+ if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch())
+ return nullptr;
+
+ BranchProbability LikelyTaken(15, 16);
+
+ if (BPI.getEdgeProbability(BI->getParent(), (unsigned) 0) < LikelyTaken)
+ return nullptr;
+
+ Value *Length = nullptr;
+ const SCEV *IndexSCEV = nullptr;
+
+ if (!SplitRangeCheckCondition(L, SE, BI->getCondition(), IndexSCEV, Length))
+ return nullptr;
+
+ assert(IndexSCEV && Length && "contract with SplitRangeCheckCondition!");
+
+ const SCEVAddRecExpr *IndexAddRec = dyn_cast<SCEVAddRecExpr>(IndexSCEV);
+ bool IsAffineIndex =
+ IndexAddRec && (IndexAddRec->getLoop() == L) && IndexAddRec->isAffine();
+
+ if (!IsAffineIndex)
+ return nullptr;
+
+ InductiveRangeCheck *IRC = new (A.Allocate()) InductiveRangeCheck;
+ IRC->Length = Length;
+ IRC->Offset = IndexAddRec->getStart();
+ IRC->Scale = IndexAddRec->getStepRecurrence(SE);
+ IRC->Branch = BI;
+ return IRC;
+}
+
+namespace {
+
+// Keeps track of the structure of a loop. This is similar to llvm::Loop,
+// except that it is more lightweight and can track the state of a loop through
+// changing and potentially invalid IR. This structure also formalizes the
+// kinds of loops we can deal with -- ones that have a single latch that is also
+// an exiting block *and* have a canonical induction variable.
+struct LoopStructure {
+ const char *Tag;
+
+ BasicBlock *Header;
+ BasicBlock *Latch;
+
+ // `Latch's terminator instruction is `LatchBr', and it's `LatchBrExitIdx'th
+ // successor is `LatchExit', the exit block of the loop.
+ BranchInst *LatchBr;
+ BasicBlock *LatchExit;
+ unsigned LatchBrExitIdx;
+
+ Value *IndVarNext;
+ Value *IndVarStart;
+ Value *LoopExitAt;
+ bool IndVarIncreasing;
+
+ LoopStructure()
+ : Tag(""), Header(nullptr), Latch(nullptr), LatchBr(nullptr),
+ LatchExit(nullptr), LatchBrExitIdx(-1), IndVarNext(nullptr),
+ IndVarStart(nullptr), LoopExitAt(nullptr), IndVarIncreasing(false) {}
+
+ template <typename M> LoopStructure map(M Map) const {
+ LoopStructure Result;
+ Result.Tag = Tag;
+ Result.Header = cast<BasicBlock>(Map(Header));
+ Result.Latch = cast<BasicBlock>(Map(Latch));
+ Result.LatchBr = cast<BranchInst>(Map(LatchBr));
+ Result.LatchExit = cast<BasicBlock>(Map(LatchExit));
+ Result.LatchBrExitIdx = LatchBrExitIdx;
+ Result.IndVarNext = Map(IndVarNext);
+ Result.IndVarStart = Map(IndVarStart);
+ Result.LoopExitAt = Map(LoopExitAt);
+ Result.IndVarIncreasing = IndVarIncreasing;
+ return Result;
+ }
+
+ static Optional<LoopStructure> parseLoopStructure(ScalarEvolution &,
+ BranchProbabilityInfo &BPI,
+ Loop &,
+ const char *&);
+};
+
+/// This class is used to constrain loops to run within a given iteration space.
+/// The algorithm this class implements is given a Loop and a range [Begin,
+/// End). The algorithm then tries to break out a "main loop" out of the loop
+/// it is given in a way that the "main loop" runs with the induction variable
+/// in a subset of [Begin, End). The algorithm emits appropriate pre and post
+/// loops to run any remaining iterations. The pre loop runs any iterations in
+/// which the induction variable is < Begin, and the post loop runs any
+/// iterations in which the induction variable is >= End.
+///
+class LoopConstrainer {
+ // The representation of a clone of the original loop we started out with.
+ struct ClonedLoop {
+ // The cloned blocks
+ std::vector<BasicBlock *> Blocks;
+
+ // `Map` maps values in the clonee into values in the cloned version
+ ValueToValueMapTy Map;
+
+ // An instance of `LoopStructure` for the cloned loop
+ LoopStructure Structure;
+ };
+
+ // Result of rewriting the range of a loop. See changeIterationSpaceEnd for
+ // more details on what these fields mean.
+ struct RewrittenRangeInfo {
+ BasicBlock *PseudoExit;
+ BasicBlock *ExitSelector;
+ std::vector<PHINode *> PHIValuesAtPseudoExit;
+ PHINode *IndVarEnd;
+
+ RewrittenRangeInfo()
+ : PseudoExit(nullptr), ExitSelector(nullptr), IndVarEnd(nullptr) {}
+ };
+
+ // Calculated subranges we restrict the iteration space of the main loop to.
+ // See the implementation of `calculateSubRanges' for more details on how
+ // these fields are computed. `LowLimit` is None if there is no restriction
+ // on low end of the restricted iteration space of the main loop. `HighLimit`
+ // is None if there is no restriction on high end of the restricted iteration
+ // space of the main loop.
+
+ struct SubRanges {
+ Optional<const SCEV *> LowLimit;
+ Optional<const SCEV *> HighLimit;
+ };
+
+ // A utility function that does a `replaceUsesOfWith' on the incoming block
+ // set of a `PHINode' -- replaces instances of `Block' in the `PHINode's
+ // incoming block list with `ReplaceBy'.
+ static void replacePHIBlock(PHINode *PN, BasicBlock *Block,
+ BasicBlock *ReplaceBy);
+
+ // Compute a safe set of limits for the main loop to run in -- effectively the
+ // intersection of `Range' and the iteration space of the original loop.
+ // Return None if unable to compute the set of subranges.
+ //
+ Optional<SubRanges> calculateSubRanges() const;
+
+ // Clone `OriginalLoop' and return the result in CLResult. The IR after
+ // running `cloneLoop' is well formed except for the PHI nodes in CLResult --
+ // the PHI nodes say that there is an incoming edge from `OriginalPreheader`
+ // but there is no such edge.
+ //
+ void cloneLoop(ClonedLoop &CLResult, const char *Tag) const;
+
+ // Rewrite the iteration space of the loop denoted by (LS, Preheader). The
+ // iteration space of the rewritten loop ends at ExitLoopAt. The start of the
+ // iteration space is not changed. `ExitLoopAt' is assumed to be slt
+ // `OriginalHeaderCount'.
+ //
+ // If there are iterations left to execute, control is made to jump to
+ // `ContinuationBlock', otherwise they take the normal loop exit. The
+ // returned `RewrittenRangeInfo' object is populated as follows:
+ //
+ // .PseudoExit is a basic block that unconditionally branches to
+ // `ContinuationBlock'.
+ //
+ // .ExitSelector is a basic block that decides, on exit from the loop,
+ // whether to branch to the "true" exit or to `PseudoExit'.
+ //
+ // .PHIValuesAtPseudoExit are PHINodes in `PseudoExit' that compute the value
+ // for each PHINode in the loop header on taking the pseudo exit.
+ //
+ // After changeIterationSpaceEnd, `Preheader' is no longer a legitimate
+ // preheader because it is made to branch to the loop header only
+ // conditionally.
+ //
+ RewrittenRangeInfo
+ changeIterationSpaceEnd(const LoopStructure &LS, BasicBlock *Preheader,
+ Value *ExitLoopAt,
+ BasicBlock *ContinuationBlock) const;
+
+ // The loop denoted by `LS' has `OldPreheader' as its preheader. This
+ // function creates a new preheader for `LS' and returns it.
+ //
+ BasicBlock *createPreheader(const LoopStructure &LS, BasicBlock *OldPreheader,
+ const char *Tag) const;
+
+ // `ContinuationBlockAndPreheader' was the continuation block for some call to
+ // `changeIterationSpaceEnd' and is the preheader to the loop denoted by `LS'.
+ // This function rewrites the PHI nodes in `LS.Header' to start with the
+ // correct value.
+ void rewriteIncomingValuesForPHIs(
+ LoopStructure &LS, BasicBlock *ContinuationBlockAndPreheader,
+ const LoopConstrainer::RewrittenRangeInfo &RRI) const;
+
+ // Even though we do not preserve any passes at this time, we at least need to
+ // keep the parent loop structure consistent. The `LPPassManager' seems to
+ // verify this after running a loop pass. This function adds the list of
+ // blocks denoted by BBs to this loops parent loop if required.
+ void addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs);
+
+ // Some global state.
+ Function &F;
+ LLVMContext &Ctx;
+ ScalarEvolution &SE;
+
+ // Information about the original loop we started out with.
+ Loop &OriginalLoop;
+ LoopInfo &OriginalLoopInfo;
+ const SCEV *LatchTakenCount;
+ BasicBlock *OriginalPreheader;
+
+ // The preheader of the main loop. This may or may not be different from
+ // `OriginalPreheader'.
+ BasicBlock *MainLoopPreheader;
+
+ // The range we need to run the main loop in.
+ InductiveRangeCheck::Range Range;
+
+ // The structure of the main loop (see comment at the beginning of this class
+ // for a definition)
+ LoopStructure MainLoopStructure;
+
+public:
+ LoopConstrainer(Loop &L, LoopInfo &LI, const LoopStructure &LS,
+ ScalarEvolution &SE, InductiveRangeCheck::Range R)
+ : F(*L.getHeader()->getParent()), Ctx(L.getHeader()->getContext()),
+ SE(SE), OriginalLoop(L), OriginalLoopInfo(LI), LatchTakenCount(nullptr),
+ OriginalPreheader(nullptr), MainLoopPreheader(nullptr), Range(R),
+ MainLoopStructure(LS) {}
+
+ // Entry point for the algorithm. Returns true on success.
+ bool run();
+};
+
+}
+
+void LoopConstrainer::replacePHIBlock(PHINode *PN, BasicBlock *Block,
+ BasicBlock *ReplaceBy) {
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingBlock(i) == Block)
+ PN->setIncomingBlock(i, ReplaceBy);
+}
+
+static bool CanBeSMax(ScalarEvolution &SE, const SCEV *S) {
+ APInt SMax =
+ APInt::getSignedMaxValue(cast<IntegerType>(S->getType())->getBitWidth());
+ return SE.getSignedRange(S).contains(SMax) &&
+ SE.getUnsignedRange(S).contains(SMax);
+}
+
+static bool CanBeSMin(ScalarEvolution &SE, const SCEV *S) {
+ APInt SMin =
+ APInt::getSignedMinValue(cast<IntegerType>(S->getType())->getBitWidth());
+ return SE.getSignedRange(S).contains(SMin) &&
+ SE.getUnsignedRange(S).contains(SMin);
+}
+
+Optional<LoopStructure>
+LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BPI,
+ Loop &L, const char *&FailureReason) {
+ assert(L.isLoopSimplifyForm() && "should follow from addRequired<>");
+
+ BasicBlock *Latch = L.getLoopLatch();
+ if (!L.isLoopExiting(Latch)) {
+ FailureReason = "no loop latch";
+ return None;
+ }
+
+ BasicBlock *Header = L.getHeader();
+ BasicBlock *Preheader = L.getLoopPreheader();
+ if (!Preheader) {
+ FailureReason = "no preheader";
+ return None;
+ }
+
+ BranchInst *LatchBr = dyn_cast<BranchInst>(&*Latch->rbegin());
+ if (!LatchBr || LatchBr->isUnconditional()) {
+ FailureReason = "latch terminator not conditional branch";
+ return None;
+ }
+
+ unsigned LatchBrExitIdx = LatchBr->getSuccessor(0) == Header ? 1 : 0;
+
+ BranchProbability ExitProbability =
+ BPI.getEdgeProbability(LatchBr->getParent(), LatchBrExitIdx);
+
+ if (ExitProbability > BranchProbability(1, MaxExitProbReciprocal)) {
+ FailureReason = "short running loop, not profitable";
+ return None;
+ }
+
+ ICmpInst *ICI = dyn_cast<ICmpInst>(LatchBr->getCondition());
+ if (!ICI || !isa<IntegerType>(ICI->getOperand(0)->getType())) {
+ FailureReason = "latch terminator branch not conditional on integral icmp";
+ return None;
+ }
+
+ const SCEV *LatchCount = SE.getExitCount(&L, Latch);
+ if (isa<SCEVCouldNotCompute>(LatchCount)) {
+ FailureReason = "could not compute latch count";
+ return None;
+ }
+
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ Value *LeftValue = ICI->getOperand(0);
+ const SCEV *LeftSCEV = SE.getSCEV(LeftValue);
+ IntegerType *IndVarTy = cast<IntegerType>(LeftValue->getType());
+
+ Value *RightValue = ICI->getOperand(1);
+ const SCEV *RightSCEV = SE.getSCEV(RightValue);
+
+ // We canonicalize `ICI` such that `LeftSCEV` is an add recurrence.
+ if (!isa<SCEVAddRecExpr>(LeftSCEV)) {
+ if (isa<SCEVAddRecExpr>(RightSCEV)) {
+ std::swap(LeftSCEV, RightSCEV);
+ std::swap(LeftValue, RightValue);
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ } else {
+ FailureReason = "no add recurrences in the icmp";
+ return None;
+ }
+ }
+
+ auto IsInductionVar = [&SE](const SCEVAddRecExpr *AR, bool &IsIncreasing) {
+ if (!AR->isAffine())
+ return false;
+
+ IntegerType *Ty = cast<IntegerType>(AR->getType());
+ IntegerType *WideTy =
+ IntegerType::get(Ty->getContext(), Ty->getBitWidth() * 2);
+
+ // Currently we only work with induction variables that have been proved to
+ // not wrap. This restriction can potentially be lifted in the future.
+
+ const SCEVAddRecExpr *ExtendAfterOp =
+ dyn_cast<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
+ if (!ExtendAfterOp)
+ return false;
+
+ const SCEV *ExtendedStart = SE.getSignExtendExpr(AR->getStart(), WideTy);
+ const SCEV *ExtendedStep =
+ SE.getSignExtendExpr(AR->getStepRecurrence(SE), WideTy);
+
+ bool NoSignedWrap = ExtendAfterOp->getStart() == ExtendedStart &&
+ ExtendAfterOp->getStepRecurrence(SE) == ExtendedStep;
+
+ if (!NoSignedWrap)
+ return false;
+
+ if (const SCEVConstant *StepExpr =
+ dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE))) {
+ ConstantInt *StepCI = StepExpr->getValue();
+ if (StepCI->isOne() || StepCI->isMinusOne()) {
+ IsIncreasing = StepCI->isOne();
+ return true;
+ }
+ }
+
+ return false;
+ };
+
+ // `ICI` is interpreted as taking the backedge if the *next* value of the
+ // induction variable satisfies some constraint.
+
+ const SCEVAddRecExpr *IndVarNext = cast<SCEVAddRecExpr>(LeftSCEV);
+ bool IsIncreasing = false;
+ if (!IsInductionVar(IndVarNext, IsIncreasing)) {
+ FailureReason = "LHS in icmp not induction variable";
+ return None;
+ }
+
+ ConstantInt *One = ConstantInt::get(IndVarTy, 1);
+ // TODO: generalize the predicates here to also match their unsigned variants.
+ if (IsIncreasing) {
+ bool FoundExpectedPred =
+ (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 1) ||
+ (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 0);
+
+ if (!FoundExpectedPred) {
+ FailureReason = "expected icmp slt semantically, found something else";
+ return None;
+ }
+
+ if (LatchBrExitIdx == 0) {
+ if (CanBeSMax(SE, RightSCEV)) {
+ // TODO: this restriction is easily removable -- we just have to
+ // remember that the icmp was an slt and not an sle.
+ FailureReason = "limit may overflow when coercing sle to slt";
+ return None;
+ }
+
+ IRBuilder<> B(&*Preheader->rbegin());
+ RightValue = B.CreateAdd(RightValue, One);
+ }
+
+ } else {
+ bool FoundExpectedPred =
+ (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 1) ||
+ (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 0);
+
+ if (!FoundExpectedPred) {
+ FailureReason = "expected icmp sgt semantically, found something else";
+ return None;
+ }
+
+ if (LatchBrExitIdx == 0) {
+ if (CanBeSMin(SE, RightSCEV)) {
+ // TODO: this restriction is easily removable -- we just have to
+ // remember that the icmp was an sgt and not an sge.
+ FailureReason = "limit may overflow when coercing sge to sgt";
+ return None;
+ }
+
+ IRBuilder<> B(&*Preheader->rbegin());
+ RightValue = B.CreateSub(RightValue, One);
+ }
+ }
+
+ const SCEV *StartNext = IndVarNext->getStart();
+ const SCEV *Addend = SE.getNegativeSCEV(IndVarNext->getStepRecurrence(SE));
+ const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
+
+ BasicBlock *LatchExit = LatchBr->getSuccessor(LatchBrExitIdx);
+
+ assert(SE.getLoopDisposition(LatchCount, &L) ==
+ ScalarEvolution::LoopInvariant &&
+ "loop variant exit count doesn't make sense!");
+
+ assert(!L.contains(LatchExit) && "expected an exit block!");
+
+ Value *IndVarStartV = SCEVExpander(SE, "irce").expandCodeFor(
+ IndVarStart, IndVarTy, &*Preheader->rbegin());
+ IndVarStartV->setName("indvar.start");
+
+ LoopStructure Result;
+
+ Result.Tag = "main";
+ Result.Header = Header;
+ Result.Latch = Latch;
+ Result.LatchBr = LatchBr;
+ Result.LatchExit = LatchExit;
+ Result.LatchBrExitIdx = LatchBrExitIdx;
+ Result.IndVarStart = IndVarStartV;
+ Result.IndVarNext = LeftValue;
+ Result.IndVarIncreasing = IsIncreasing;
+ Result.LoopExitAt = RightValue;
+
+ FailureReason = nullptr;
+
+ return Result;
+}
+
+Optional<LoopConstrainer::SubRanges>
+LoopConstrainer::calculateSubRanges() const {
+ IntegerType *Ty = cast<IntegerType>(LatchTakenCount->getType());
+
+ if (Range.getType() != Ty)
+ return None;
+
+ LoopConstrainer::SubRanges Result;
+
+ // I think we can be more aggressive here and make this nuw / nsw if the
+ // addition that feeds into the icmp for the latch's terminating branch is nuw
+ // / nsw. In any case, a wrapping 2's complement addition is safe.
+ ConstantInt *One = ConstantInt::get(Ty, 1);
+ const SCEV *Start = SE.getSCEV(MainLoopStructure.IndVarStart);
+ const SCEV *End = SE.getSCEV(MainLoopStructure.LoopExitAt);
+
+ bool Increasing = MainLoopStructure.IndVarIncreasing;
+ // We compute `Smallest` and `Greatest` such that [Smallest, Greatest) is the
+ // range of values the induction variable takes.
+ const SCEV *Smallest =
+ Increasing ? Start : SE.getAddExpr(End, SE.getSCEV(One));
+ const SCEV *Greatest =
+ Increasing ? End : SE.getAddExpr(Start, SE.getSCEV(One));
+
+ auto Clamp = [this, Smallest, Greatest](const SCEV *S) {
+ return SE.getSMaxExpr(Smallest, SE.getSMinExpr(Greatest, S));
+ };
+
+ // In some cases we can prove that we don't need a pre or post loop
+
+ bool ProvablyNoPreloop =
+ SE.isKnownPredicate(ICmpInst::ICMP_SLE, Range.getBegin(), Smallest);
+ if (!ProvablyNoPreloop)
+ Result.LowLimit = Clamp(Range.getBegin());
+
+ bool ProvablyNoPostLoop =
+ SE.isKnownPredicate(ICmpInst::ICMP_SLE, Greatest, Range.getEnd());
+ if (!ProvablyNoPostLoop)
+ Result.HighLimit = Clamp(Range.getEnd());
+
+ return Result;
+}
+
+void LoopConstrainer::cloneLoop(LoopConstrainer::ClonedLoop &Result,
+ const char *Tag) const {
+ for (BasicBlock *BB : OriginalLoop.getBlocks()) {
+ BasicBlock *Clone = CloneBasicBlock(BB, Result.Map, Twine(".") + Tag, &F);
+ Result.Blocks.push_back(Clone);
+ Result.Map[BB] = Clone;
+ }
+
+ auto GetClonedValue = [&Result](Value *V) {
+ assert(V && "null values not in domain!");
+ auto It = Result.Map.find(V);
+ if (It == Result.Map.end())
+ return V;
+ return static_cast<Value *>(It->second);
+ };
+
+ Result.Structure = MainLoopStructure.map(GetClonedValue);
+ Result.Structure.Tag = Tag;
+
+ for (unsigned i = 0, e = Result.Blocks.size(); i != e; ++i) {
+ BasicBlock *ClonedBB = Result.Blocks[i];
+ BasicBlock *OriginalBB = OriginalLoop.getBlocks()[i];
+
+ assert(Result.Map[OriginalBB] == ClonedBB && "invariant!");
+
+ for (Instruction &I : *ClonedBB)
+ RemapInstruction(&I, Result.Map,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+
+ // Exit blocks will now have one more predecessor and their PHI nodes need
+ // to be edited to reflect that. No phi nodes need to be introduced because
+ // the loop is in LCSSA.
+
+ for (auto SBBI = succ_begin(OriginalBB), SBBE = succ_end(OriginalBB);
+ SBBI != SBBE; ++SBBI) {
+
+ if (OriginalLoop.contains(*SBBI))
+ continue; // not an exit block
+
+ for (Instruction &I : **SBBI) {
+ if (!isa<PHINode>(&I))
+ break;
+
+ PHINode *PN = cast<PHINode>(&I);
+ Value *OldIncoming = PN->getIncomingValueForBlock(OriginalBB);
+ PN->addIncoming(GetClonedValue(OldIncoming), ClonedBB);
+ }
+ }
+ }
+}
+
+LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
+ const LoopStructure &LS, BasicBlock *Preheader, Value *ExitSubloopAt,
+ BasicBlock *ContinuationBlock) const {
+
+ // We start with a loop with a single latch:
+ //
+ // +--------------------+
+ // | |
+ // | preheader |
+ // | |
+ // +--------+-----------+
+ // | ----------------\
+ // | / |
+ // +--------v----v------+ |
+ // | | |
+ // | header | |
+ // | | |
+ // +--------------------+ |
+ // |
+ // ..... |
+ // |
+ // +--------------------+ |
+ // | | |
+ // | latch >----------/
+ // | |
+ // +-------v------------+
+ // |
+ // |
+ // | +--------------------+
+ // | | |
+ // +---> original exit |
+ // | |
+ // +--------------------+
+ //
+ // We change the control flow to look like
+ //
+ //
+ // +--------------------+
+ // | |
+ // | preheader >-------------------------+
+ // | | |
+ // +--------v-----------+ |
+ // | /-------------+ |
+ // | / | |
+ // +--------v--v--------+ | |
+ // | | | |
+ // | header | | +--------+ |
+ // | | | | | |
+ // +--------------------+ | | +-----v-----v-----------+
+ // | | | |
+ // | | | .pseudo.exit |
+ // | | | |
+ // | | +-----------v-----------+
+ // | | |
+ // ..... | | |
+ // | | +--------v-------------+
+ // +--------------------+ | | | |
+ // | | | | | ContinuationBlock |
+ // | latch >------+ | | |
+ // | | | +----------------------+
+ // +---------v----------+ |
+ // | |
+ // | |
+ // | +---------------^-----+
+ // | | |
+ // +-----> .exit.selector |
+ // | |
+ // +----------v----------+
+ // |
+ // +--------------------+ |
+ // | | |
+ // | original exit <----+
+ // | |
+ // +--------------------+
+ //
+
+ RewrittenRangeInfo RRI;
+
+ auto BBInsertLocation = std::next(Function::iterator(LS.Latch));
+ RRI.ExitSelector = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".exit.selector",
+ &F, BBInsertLocation);
+ RRI.PseudoExit = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".pseudo.exit", &F,
+ BBInsertLocation);
+
+ BranchInst *PreheaderJump = cast<BranchInst>(&*Preheader->rbegin());
+ bool Increasing = LS.IndVarIncreasing;
+
+ IRBuilder<> B(PreheaderJump);
+
+ // EnterLoopCond - is it okay to start executing this `LS'?
+ Value *EnterLoopCond = Increasing
+ ? B.CreateICmpSLT(LS.IndVarStart, ExitSubloopAt)
+ : B.CreateICmpSGT(LS.IndVarStart, ExitSubloopAt);
+
+ B.CreateCondBr(EnterLoopCond, LS.Header, RRI.PseudoExit);
+ PreheaderJump->eraseFromParent();
+
+ LS.LatchBr->setSuccessor(LS.LatchBrExitIdx, RRI.ExitSelector);
+ B.SetInsertPoint(LS.LatchBr);
+ Value *TakeBackedgeLoopCond =
+ Increasing ? B.CreateICmpSLT(LS.IndVarNext, ExitSubloopAt)
+ : B.CreateICmpSGT(LS.IndVarNext, ExitSubloopAt);
+ Value *CondForBranch = LS.LatchBrExitIdx == 1
+ ? TakeBackedgeLoopCond
+ : B.CreateNot(TakeBackedgeLoopCond);
+
+ LS.LatchBr->setCondition(CondForBranch);
+
+ B.SetInsertPoint(RRI.ExitSelector);
+
+ // IterationsLeft - are there any more iterations left, given the original
+ // upper bound on the induction variable? If not, we branch to the "real"
+ // exit.
+ Value *IterationsLeft = Increasing
+ ? B.CreateICmpSLT(LS.IndVarNext, LS.LoopExitAt)
+ : B.CreateICmpSGT(LS.IndVarNext, LS.LoopExitAt);
+ B.CreateCondBr(IterationsLeft, RRI.PseudoExit, LS.LatchExit);
+
+ BranchInst *BranchToContinuation =
+ BranchInst::Create(ContinuationBlock, RRI.PseudoExit);
+
+ // We emit PHI nodes into `RRI.PseudoExit' that compute the "latest" value of
+ // each of the PHI nodes in the loop header. This feeds into the initial
+ // value of the same PHI nodes if/when we continue execution.
+ for (Instruction &I : *LS.Header) {
+ if (!isa<PHINode>(&I))
+ break;
+
+ PHINode *PN = cast<PHINode>(&I);
+
+ PHINode *NewPHI = PHINode::Create(PN->getType(), 2, PN->getName() + ".copy",
+ BranchToContinuation);
+
+ NewPHI->addIncoming(PN->getIncomingValueForBlock(Preheader), Preheader);
+ NewPHI->addIncoming(PN->getIncomingValueForBlock(LS.Latch),
+ RRI.ExitSelector);
+ RRI.PHIValuesAtPseudoExit.push_back(NewPHI);
+ }
+
+ RRI.IndVarEnd = PHINode::Create(LS.IndVarNext->getType(), 2, "indvar.end",
+ BranchToContinuation);
+ RRI.IndVarEnd->addIncoming(LS.IndVarStart, Preheader);
+ RRI.IndVarEnd->addIncoming(LS.IndVarNext, RRI.ExitSelector);
+
+ // The latch exit now has a branch from `RRI.ExitSelector' instead of
+ // `LS.Latch'. The PHI nodes need to be updated to reflect that.
+ for (Instruction &I : *LS.LatchExit) {
+ if (PHINode *PN = dyn_cast<PHINode>(&I))
+ replacePHIBlock(PN, LS.Latch, RRI.ExitSelector);
+ else
+ break;
+ }
+
+ return RRI;
+}
+
+void LoopConstrainer::rewriteIncomingValuesForPHIs(
+ LoopStructure &LS, BasicBlock *ContinuationBlock,
+ const LoopConstrainer::RewrittenRangeInfo &RRI) const {
+
+ unsigned PHIIndex = 0;
+ for (Instruction &I : *LS.Header) {
+ if (!isa<PHINode>(&I))
+ break;
+
+ PHINode *PN = cast<PHINode>(&I);
+
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i)
+ if (PN->getIncomingBlock(i) == ContinuationBlock)
+ PN->setIncomingValue(i, RRI.PHIValuesAtPseudoExit[PHIIndex++]);
+ }
+
+ LS.IndVarStart = RRI.IndVarEnd;
+}
+
+BasicBlock *LoopConstrainer::createPreheader(const LoopStructure &LS,
+ BasicBlock *OldPreheader,
+ const char *Tag) const {
+
+ BasicBlock *Preheader = BasicBlock::Create(Ctx, Tag, &F, LS.Header);
+ BranchInst::Create(LS.Header, Preheader);
+
+ for (Instruction &I : *LS.Header) {
+ if (!isa<PHINode>(&I))
+ break;
+
+ PHINode *PN = cast<PHINode>(&I);
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i)
+ replacePHIBlock(PN, OldPreheader, Preheader);
+ }
+
+ return Preheader;
+}
+
+void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs) {
+ Loop *ParentLoop = OriginalLoop.getParentLoop();
+ if (!ParentLoop)
+ return;
+
+ for (BasicBlock *BB : BBs)
+ ParentLoop->addBasicBlockToLoop(BB, OriginalLoopInfo);
+}
+
+bool LoopConstrainer::run() {
+ BasicBlock *Preheader = nullptr;
+ LatchTakenCount = SE.getExitCount(&OriginalLoop, MainLoopStructure.Latch);
+ Preheader = OriginalLoop.getLoopPreheader();
+ assert(!isa<SCEVCouldNotCompute>(LatchTakenCount) && Preheader != nullptr &&
+ "preconditions!");
+
+ OriginalPreheader = Preheader;
+ MainLoopPreheader = Preheader;
+
+ Optional<SubRanges> MaybeSR = calculateSubRanges();
+ if (!MaybeSR.hasValue()) {
+ DEBUG(dbgs() << "irce: could not compute subranges\n");
+ return false;
+ }
+
+ SubRanges SR = MaybeSR.getValue();
+ bool Increasing = MainLoopStructure.IndVarIncreasing;
+ IntegerType *IVTy =
+ cast<IntegerType>(MainLoopStructure.IndVarNext->getType());
+
+ SCEVExpander Expander(SE, "irce");
+ Instruction *InsertPt = OriginalPreheader->getTerminator();
+
+ // It would have been better to make `PreLoop' and `PostLoop'
+ // `Optional<ClonedLoop>'s, but `ValueToValueMapTy' does not have a copy
+ // constructor.
+ ClonedLoop PreLoop, PostLoop;
+ bool NeedsPreLoop =
+ Increasing ? SR.LowLimit.hasValue() : SR.HighLimit.hasValue();
+ bool NeedsPostLoop =
+ Increasing ? SR.HighLimit.hasValue() : SR.LowLimit.hasValue();
+
+ Value *ExitPreLoopAt = nullptr;
+ Value *ExitMainLoopAt = nullptr;
+ const SCEVConstant *MinusOneS =
+ cast<SCEVConstant>(SE.getConstant(IVTy, -1, true /* isSigned */));
+
+ if (NeedsPreLoop) {
+ const SCEV *ExitPreLoopAtSCEV = nullptr;
+
+ if (Increasing)
+ ExitPreLoopAtSCEV = *SR.LowLimit;
+ else {
+ if (CanBeSMin(SE, *SR.HighLimit)) {
+ DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
+ << "preloop exit limit. HighLimit = " << *(*SR.HighLimit)
+ << "\n");
+ return false;
+ }
+ ExitPreLoopAtSCEV = SE.getAddExpr(*SR.HighLimit, MinusOneS);
+ }
+
+ ExitPreLoopAt = Expander.expandCodeFor(ExitPreLoopAtSCEV, IVTy, InsertPt);
+ ExitPreLoopAt->setName("exit.preloop.at");
+ }
+
+ if (NeedsPostLoop) {
+ const SCEV *ExitMainLoopAtSCEV = nullptr;
+
+ if (Increasing)
+ ExitMainLoopAtSCEV = *SR.HighLimit;
+ else {
+ if (CanBeSMin(SE, *SR.LowLimit)) {
+ DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
+ << "mainloop exit limit. LowLimit = " << *(*SR.LowLimit)
+ << "\n");
+ return false;
+ }
+ ExitMainLoopAtSCEV = SE.getAddExpr(*SR.LowLimit, MinusOneS);
+ }
+
+ ExitMainLoopAt = Expander.expandCodeFor(ExitMainLoopAtSCEV, IVTy, InsertPt);
+ ExitMainLoopAt->setName("exit.mainloop.at");
+ }
+
+ // We clone these ahead of time so that we don't have to deal with changing
+ // and temporarily invalid IR as we transform the loops.
+ if (NeedsPreLoop)
+ cloneLoop(PreLoop, "preloop");
+ if (NeedsPostLoop)
+ cloneLoop(PostLoop, "postloop");
+
+ RewrittenRangeInfo PreLoopRRI;
+
+ if (NeedsPreLoop) {
+ Preheader->getTerminator()->replaceUsesOfWith(MainLoopStructure.Header,
+ PreLoop.Structure.Header);
+
+ MainLoopPreheader =
+ createPreheader(MainLoopStructure, Preheader, "mainloop");
+ PreLoopRRI = changeIterationSpaceEnd(PreLoop.Structure, Preheader,
+ ExitPreLoopAt, MainLoopPreheader);
+ rewriteIncomingValuesForPHIs(MainLoopStructure, MainLoopPreheader,
+ PreLoopRRI);
+ }
+
+ BasicBlock *PostLoopPreheader = nullptr;
+ RewrittenRangeInfo PostLoopRRI;
+
+ if (NeedsPostLoop) {
+ PostLoopPreheader =
+ createPreheader(PostLoop.Structure, Preheader, "postloop");
+ PostLoopRRI = changeIterationSpaceEnd(MainLoopStructure, MainLoopPreheader,
+ ExitMainLoopAt, PostLoopPreheader);
+ rewriteIncomingValuesForPHIs(PostLoop.Structure, PostLoopPreheader,
+ PostLoopRRI);
+ }
+
+ BasicBlock *NewMainLoopPreheader =
+ MainLoopPreheader != Preheader ? MainLoopPreheader : nullptr;
+ BasicBlock *NewBlocks[] = {PostLoopPreheader, PreLoopRRI.PseudoExit,
+ PreLoopRRI.ExitSelector, PostLoopRRI.PseudoExit,
+ PostLoopRRI.ExitSelector, NewMainLoopPreheader};
+
+ // Some of the above may be nullptr, filter them out before passing to
+ // addToParentLoopIfNeeded.
+ auto NewBlocksEnd =
+ std::remove(std::begin(NewBlocks), std::end(NewBlocks), nullptr);
+
+ addToParentLoopIfNeeded(makeArrayRef(std::begin(NewBlocks), NewBlocksEnd));
+ addToParentLoopIfNeeded(PreLoop.Blocks);
+ addToParentLoopIfNeeded(PostLoop.Blocks);
+
+ return true;
+}
+
+/// Computes and returns a range of values for the induction variable (IndVar)
+/// in which the range check can be safely elided. If it cannot compute such a
+/// range, returns None.
+Optional<InductiveRangeCheck::Range>
+InductiveRangeCheck::computeSafeIterationSpace(ScalarEvolution &SE,
+ const SCEVAddRecExpr *IndVar,
+ IRBuilder<> &) const {
+ // IndVar is of the form "A + B * I" (where "I" is the canonical induction
+ // variable, that may or may not exist as a real llvm::Value in the loop) and
+ // this inductive range check is a range check on the "C + D * I" ("C" is
+ // getOffset() and "D" is getScale()). We rewrite the value being range
+ // checked to "M + N * IndVar" where "N" = "D * B^(-1)" and "M" = "C - NA".
+ // Currently we support this only for "B" = "D" = { 1 or -1 }, but the code
+ // can be generalized as needed.
+ //
+ // The actual inequalities we solve are of the form
+ //
+ // 0 <= M + 1 * IndVar < L given L >= 0 (i.e. N == 1)
+ //
+ // The inequality is satisfied by -M <= IndVar < (L - M) [^1]. All additions
+ // and subtractions are twos-complement wrapping and comparisons are signed.
+ //
+ // Proof:
+ //
+ // If there exists IndVar such that -M <= IndVar < (L - M) then it follows
+ // that -M <= (-M + L) [== Eq. 1]. Since L >= 0, if (-M + L) sign-overflows
+ // then (-M + L) < (-M). Hence by [Eq. 1], (-M + L) could not have
+ // overflown.
+ //
+ // This means IndVar = t + (-M) for t in [0, L). Hence (IndVar + M) = t.
+ // Hence 0 <= (IndVar + M) < L
+
+ // [^1]: Note that the solution does _not_ apply if L < 0; consider values M =
+ // 127, IndVar = 126 and L = -2 in an i8 world.
+
+ if (!IndVar->isAffine())
+ return None;
+
+ const SCEV *A = IndVar->getStart();
+ const SCEVConstant *B = dyn_cast<SCEVConstant>(IndVar->getStepRecurrence(SE));
+ if (!B)
+ return None;
+
+ const SCEV *C = getOffset();
+ const SCEVConstant *D = dyn_cast<SCEVConstant>(getScale());
+ if (D != B)
+ return None;
+
+ ConstantInt *ConstD = D->getValue();
+ if (!(ConstD->isMinusOne() || ConstD->isOne()))
+ return None;
+
+ const SCEV *M = SE.getMinusSCEV(C, A);
+
+ const SCEV *Begin = SE.getNegativeSCEV(M);
+ const SCEV *End = SE.getMinusSCEV(SE.getSCEV(getLength()), M);
+
+ return InductiveRangeCheck::Range(Begin, End);
+}
+
+static Optional<InductiveRangeCheck::Range>
+IntersectRange(ScalarEvolution &SE,
+ const Optional<InductiveRangeCheck::Range> &R1,
+ const InductiveRangeCheck::Range &R2, IRBuilder<> &B) {
+ if (!R1.hasValue())
+ return R2;
+ auto &R1Value = R1.getValue();
+
+ // TODO: we could widen the smaller range and have this work; but for now we
+ // bail out to keep things simple.
+ if (R1Value.getType() != R2.getType())
+ return None;
+
+ const SCEV *NewBegin = SE.getSMaxExpr(R1Value.getBegin(), R2.getBegin());
+ const SCEV *NewEnd = SE.getSMinExpr(R1Value.getEnd(), R2.getEnd());
+
+ return InductiveRangeCheck::Range(NewBegin, NewEnd);
+}
+
+bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
+ if (L->getBlocks().size() >= LoopSizeCutoff) {
+ DEBUG(dbgs() << "irce: giving up constraining loop, too large\n";);
+ return false;
+ }
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader) {
+ DEBUG(dbgs() << "irce: loop has no preheader, leaving\n");
+ return false;
+ }
+
+ LLVMContext &Context = Preheader->getContext();
+ InductiveRangeCheck::AllocatorTy IRCAlloc;
+ SmallVector<InductiveRangeCheck *, 16> RangeChecks;
+ ScalarEvolution &SE = getAnalysis<ScalarEvolution>();
+ BranchProbabilityInfo &BPI = getAnalysis<BranchProbabilityInfo>();
+
+ for (auto BBI : L->getBlocks())
+ if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator()))
+ if (InductiveRangeCheck *IRC =
+ InductiveRangeCheck::create(IRCAlloc, TBI, L, SE, BPI))
+ RangeChecks.push_back(IRC);
+
+ if (RangeChecks.empty())
+ return false;
+
+ DEBUG(dbgs() << "irce: looking at loop "; L->print(dbgs());
+ dbgs() << "irce: loop has " << RangeChecks.size()
+ << " inductive range checks: \n";
+ for (InductiveRangeCheck *IRC : RangeChecks)
+ IRC->print(dbgs());
+ );
+
+ const char *FailureReason = nullptr;
+ Optional<LoopStructure> MaybeLoopStructure =
+ LoopStructure::parseLoopStructure(SE, BPI, *L, FailureReason);
+ if (!MaybeLoopStructure.hasValue()) {
+ DEBUG(dbgs() << "irce: could not parse loop structure: " << FailureReason
+ << "\n";);
+ return false;
+ }
+ LoopStructure LS = MaybeLoopStructure.getValue();
+ bool Increasing = LS.IndVarIncreasing;
+ const SCEV *MinusOne =
+ SE.getConstant(LS.IndVarNext->getType(), Increasing ? -1 : 1, true);
+ const SCEVAddRecExpr *IndVar =
+ cast<SCEVAddRecExpr>(SE.getAddExpr(SE.getSCEV(LS.IndVarNext), MinusOne));
+
+ Optional<InductiveRangeCheck::Range> SafeIterRange;
+ Instruction *ExprInsertPt = Preheader->getTerminator();
+
+ SmallVector<InductiveRangeCheck *, 4> RangeChecksToEliminate;
+
+ IRBuilder<> B(ExprInsertPt);
+ for (InductiveRangeCheck *IRC : RangeChecks) {
+ auto Result = IRC->computeSafeIterationSpace(SE, IndVar, B);
+ if (Result.hasValue()) {
+ auto MaybeSafeIterRange =
+ IntersectRange(SE, SafeIterRange, Result.getValue(), B);
+ if (MaybeSafeIterRange.hasValue()) {
+ RangeChecksToEliminate.push_back(IRC);
+ SafeIterRange = MaybeSafeIterRange.getValue();
+ }
+ }
+ }
+
+ if (!SafeIterRange.hasValue())
+ return false;
+
+ LoopConstrainer LC(*L, getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), LS,
+ SE, SafeIterRange.getValue());
+ bool Changed = LC.run();
+
+ if (Changed) {
+ auto PrintConstrainedLoopInfo = [L]() {
+ dbgs() << "irce: in function ";
+ dbgs() << L->getHeader()->getParent()->getName() << ": ";
+ dbgs() << "constrained ";
+ L->print(dbgs());
+ };
+
+ DEBUG(PrintConstrainedLoopInfo());
+
+ if (PrintChangedLoops)
+ PrintConstrainedLoopInfo();
+
+ // Optimize away the now-redundant range checks.
+
+ for (InductiveRangeCheck *IRC : RangeChecksToEliminate) {
+ ConstantInt *FoldedRangeCheck = IRC->getPassingDirection()
+ ? ConstantInt::getTrue(Context)
+ : ConstantInt::getFalse(Context);
+ IRC->getBranch()->setCondition(FoldedRangeCheck);
+ }
+ }
+
+ return Changed;
+}
+
+Pass *llvm::createInductiveRangeCheckEliminationPass() {
+ return new InductiveRangeCheckElimination;
+}
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index 60a4925..8b54abd 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -32,7 +32,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -115,7 +115,7 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LazyValueInfo>();
AU.addPreserved<LazyValueInfo>();
- AU.addRequired<TargetLibraryInfo>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
}
void FindLoopHeaders(Function &F);
@@ -145,7 +145,7 @@ char JumpThreading::ID = 0;
INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading",
"Jump Threading", false, false)
INITIALIZE_PASS_DEPENDENCY(LazyValueInfo)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(JumpThreading, "jump-threading",
"Jump Threading", false, false)
@@ -161,7 +161,7 @@ bool JumpThreading::runOnFunction(Function &F) {
DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
DL = DLP ? &DLP->getDataLayout() : nullptr;
- TLI = &getAnalysis<TargetLibraryInfo>();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
LVI = &getAnalysis<LazyValueInfo>();
// Remove unreachable blocks from function as they may result in infinite
@@ -188,7 +188,7 @@ bool JumpThreading::runOnFunction(Function &F) {
// If the block is trivially dead, zap it. This eliminates the successor
// edges which simplifies the CFG.
- if (pred_begin(BB) == pred_end(BB) &&
+ if (pred_empty(BB) &&
BB != &BB->getParent()->getEntryBlock()) {
DEBUG(dbgs() << " JT: Deleting dead block '" << BB->getName()
<< "' with terminator: " << *BB->getTerminator() << '\n');
@@ -662,7 +662,7 @@ static bool hasAddressTakenAndUsed(BasicBlock *BB) {
bool JumpThreading::ProcessBlock(BasicBlock *BB) {
// If the block is trivially dead, just return and let the caller nuke it.
// This simplifies other transformations.
- if (pred_begin(BB) == pred_end(BB) &&
+ if (pred_empty(BB) &&
BB != &BB->getParent()->getEntryBlock())
return false;
@@ -797,7 +797,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
}
} else if (CondBr && CondConst && CondBr->isConditional()) {
- // There might be an invairant in the same block with the conditional
+ // There might be an invariant in the same block with the conditional
// that can determine the predicate.
LazyValueInfo::Tristate Ret =
@@ -902,8 +902,8 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// only happen in dead loops.
if (AvailableVal == LI) AvailableVal = UndefValue::get(LI->getType());
if (AvailableVal->getType() != LI->getType())
- AvailableVal = CastInst::Create(CastInst::BitCast, AvailableVal,
- LI->getType(), "", LI);
+ AvailableVal =
+ CastInst::CreateBitOrPointerCast(AvailableVal, LI->getType(), "", LI);
LI->replaceAllUsesWith(AvailableVal);
LI->eraseFromParent();
return true;
@@ -993,7 +993,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// Split them out to their own block.
UnavailablePred =
- SplitBlockPredecessors(LoadBB, PredsToSplit, "thread-pre-split", this);
+ SplitBlockPredecessors(LoadBB, PredsToSplit, "thread-pre-split");
}
// If the value isn't available in all predecessors, then there will be
@@ -1040,8 +1040,8 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// predecessor use the same bitcast.
Value *&PredV = I->second;
if (PredV->getType() != LI->getType())
- PredV = CastInst::Create(CastInst::BitCast, PredV, LI->getType(), "",
- P->getTerminator());
+ PredV = CastInst::CreateBitOrPointerCast(PredV, LI->getType(), "",
+ P->getTerminator());
PN->addIncoming(PredV, I->first);
}
@@ -1418,7 +1418,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,
else {
DEBUG(dbgs() << " Factoring out " << PredBBs.size()
<< " common predecessors.\n");
- PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm", this);
+ PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm");
}
// And finally, do it!
@@ -1561,7 +1561,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
else {
DEBUG(dbgs() << " Factoring out " << PredBBs.size()
<< " common predecessors.\n");
- PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm", this);
+ PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm");
}
// Okay, we decided to do this! Clone all the instructions in BB onto the end
@@ -1575,7 +1575,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
if (!OldPredBranch || !OldPredBranch->isUnconditional()) {
- PredBB = SplitEdge(PredBB, BB, this);
+ PredBB = SplitEdge(PredBB, BB);
OldPredBranch = cast<BranchInst>(PredBB->getTerminator());
}
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 5f00bb9..14af38b 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -52,7 +52,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -71,6 +71,27 @@ static cl::opt<bool>
DisablePromotion("disable-licm-promotion", cl::Hidden,
cl::desc("Disable memory promotion in LICM pass"));
+static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
+static bool isNotUsedInLoop(Instruction &I, Loop *CurLoop);
+static bool hoist(Instruction &I, BasicBlock *Preheader);
+static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
+ Loop *CurLoop, AliasSetTracker *CurAST );
+static bool isGuaranteedToExecute(Instruction &Inst, DominatorTree *DT,
+ Loop *CurLoop, LICMSafetyInfo * SafetyInfo);
+static bool isSafeToExecuteUnconditionally(Instruction &Inst,DominatorTree *DT,
+ const DataLayout *DL, Loop *CurLoop,
+ LICMSafetyInfo * SafetyInfo);
+static bool pointerInvalidatedByLoop(Value *V, uint64_t Size,
+ const AAMDNodes &AAInfo,
+ AliasSetTracker *CurAST);
+static Instruction *CloneInstructionInExitBlock(Instruction &I,
+ BasicBlock &ExitBlock,
+ PHINode &PN, LoopInfo *LI);
+static bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA,
+ DominatorTree *DT, const DataLayout *DL,
+ Loop *CurLoop, AliasSetTracker *CurAST,
+ LICMSafetyInfo * SafetyInfo);
+
namespace {
struct LICM : public LoopPass {
static char ID; // Pass identification, replacement for typeid
@@ -86,7 +107,7 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfo>();
+ AU.addRequired<LoopInfoWrapperPass>();
AU.addRequiredID(LoopSimplifyID);
AU.addPreservedID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
@@ -94,7 +115,7 @@ namespace {
AU.addRequired<AliasAnalysis>();
AU.addPreserved<AliasAnalysis>();
AU.addPreserved<ScalarEvolution>();
- AU.addRequired<TargetLibraryInfo>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
}
using llvm::Pass::doFinalization;
@@ -117,9 +138,6 @@ namespace {
BasicBlock *Preheader; // The preheader block of the current loop...
Loop *CurLoop; // The current loop we are working on...
AliasSetTracker *CurAST; // AliasSet information for the current loop...
- bool MayThrow; // The current loop contains an instruction which
- // may throw, thus preventing code motion of
- // instructions with side effects.
DenseMap<Loop*, AliasSetTracker*> LoopToAliasSetMap;
/// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info.
@@ -132,88 +150,17 @@ namespace {
/// Simple Analysis hook. Delete loop L from alias set map.
void deleteAnalysisLoop(Loop *L) override;
-
- /// SinkRegion - Walk the specified region of the CFG (defined by all blocks
- /// dominated by the specified block, and that are in the current loop) in
- /// reverse depth first order w.r.t the DominatorTree. This allows us to
- /// visit uses before definitions, allowing us to sink a loop body in one
- /// pass without iteration.
- ///
- void SinkRegion(DomTreeNode *N);
-
- /// HoistRegion - Walk the specified region of the CFG (defined by all
- /// blocks dominated by the specified block, and that are in the current
- /// loop) in depth first order w.r.t the DominatorTree. This allows us to
- /// visit definitions before uses, allowing us to hoist a loop body in one
- /// pass without iteration.
- ///
- void HoistRegion(DomTreeNode *N);
-
- /// inSubLoop - Little predicate that returns true if the specified basic
- /// block is in a subloop of the current one, not the current one itself.
- ///
- bool inSubLoop(BasicBlock *BB) {
- assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop");
- return LI->getLoopFor(BB) != CurLoop;
- }
-
- /// sink - When an instruction is found to only be used outside of the loop,
- /// this function moves it to the exit blocks and patches up SSA form as
- /// needed.
- ///
- void sink(Instruction &I);
-
- /// hoist - When an instruction is found to only use loop invariant operands
- /// that is safe to hoist, this instruction is called to do the dirty work.
- ///
- void hoist(Instruction &I);
-
- /// isSafeToExecuteUnconditionally - Only sink or hoist an instruction if it
- /// is not a trapping instruction or if it is a trapping instruction and is
- /// guaranteed to execute.
- ///
- bool isSafeToExecuteUnconditionally(Instruction &I);
-
- /// isGuaranteedToExecute - Check that the instruction is guaranteed to
- /// execute.
- ///
- bool isGuaranteedToExecute(Instruction &I);
-
- /// pointerInvalidatedByLoop - Return true if the body of this loop may
- /// store into the memory location pointed to by V.
- ///
- bool pointerInvalidatedByLoop(Value *V, uint64_t Size,
- const AAMDNodes &AAInfo) {
- // Check to see if any of the basic blocks in CurLoop invalidate *V.
- return CurAST->getAliasSetForPointer(V, Size, AAInfo).isMod();
- }
-
- bool canSinkOrHoistInst(Instruction &I);
- bool isNotUsedInLoop(Instruction &I);
-
- void PromoteAliasSet(AliasSet &AS,
- SmallVectorImpl<BasicBlock*> &ExitBlocks,
- SmallVectorImpl<Instruction*> &InsertPts,
- PredIteratorCache &PIC);
-
- /// \brief Create a copy of the instruction in the exit block and patch up
- /// SSA.
- /// PN is a user of I in ExitBlock that can be used to get the number and
- /// list of predecessors fast.
- Instruction *CloneInstructionInExitBlock(Instruction &I,
- BasicBlock &ExitBlock,
- PHINode &PN);
};
}
char LICM::ID = 0;
INITIALIZE_PASS_BEGIN(LICM, "licm", "Loop Invariant Code Motion", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LCSSA)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_END(LICM, "licm", "Loop Invariant Code Motion", false, false)
@@ -230,13 +177,13 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
Changed = false;
// Get our Loop and Alias Analysis information...
- LI = &getAnalysis<LoopInfo>();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
AA = &getAnalysis<AliasAnalysis>();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
DL = DLP ? &DLP->getDataLayout() : nullptr;
- TLI = &getAnalysis<TargetLibraryInfo>();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form.");
@@ -273,14 +220,9 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
CurAST->add(*BB); // Incorporate the specified basic block
}
- MayThrow = false;
- // TODO: We've already searched for instructions which may throw in subloops.
- // We may want to reuse this information.
- for (Loop::block_iterator BB = L->block_begin(), BBE = L->block_end();
- (BB != BBE) && !MayThrow ; ++BB)
- for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end();
- (I != E) && !MayThrow; ++I)
- MayThrow |= I->mayThrow();
+ // Compute loop safety information.
+ LICMSafetyInfo SafetyInfo;
+ computeLICMSafetyInfo(&SafetyInfo, CurLoop);
// We want to visit all of the instructions in this loop... that are not parts
// of our subloops (they have already had their invariants hoisted out of
@@ -293,9 +235,11 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
// instructions, we perform another pass to hoist them out of the loop.
//
if (L->hasDedicatedExits())
- SinkRegion(DT->getNode(L->getHeader()));
+ Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, DL, TLI,
+ CurLoop, CurAST, &SafetyInfo);
if (Preheader)
- HoistRegion(DT->getNode(L->getHeader()));
+ Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, DL, TLI,
+ CurLoop, CurAST, &SafetyInfo);
// Now that all loop invariants have been removed from the loop, promote any
// memory references to scalars that we can.
@@ -307,7 +251,9 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
// Loop over all of the alias sets in the tracker object.
for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end();
I != E; ++I)
- PromoteAliasSet(*I, ExitBlocks, InsertPts, PIC);
+ Changed |= promoteLoopAccessesToScalars(*I, ExitBlocks, InsertPts,
+ PIC, LI, DT, CurLoop,
+ CurAST, &SafetyInfo);
// Once we have promoted values across the loop body we have to recursively
// reform LCSSA as any nested loop may now have values defined within the
@@ -316,7 +262,8 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
// SSAUpdater strategy during promotion that was LCSSA aware and reformed
// it as it went.
if (Changed)
- formLCSSARecursively(*L, *DT, getAnalysisIfAvailable<ScalarEvolution>());
+ formLCSSARecursively(*L, *DT, LI,
+ getAnalysisIfAvailable<ScalarEvolution>());
}
// Check that neither this loop nor its parent have had LCSSA broken. LICM is
@@ -339,27 +286,36 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
return Changed;
}
-/// SinkRegion - Walk the specified region of the CFG (defined by all blocks
-/// dominated by the specified block, and that are in the current loop) in
-/// reverse depth first order w.r.t the DominatorTree. This allows us to visit
-/// uses before definitions, allowing us to sink a loop body in one pass without
-/// iteration.
+/// Walk the specified region of the CFG (defined by all blocks dominated by
+/// the specified block, and that are in the current loop) in reverse depth
+/// first order w.r.t the DominatorTree. This allows us to visit uses before
+/// definitions, allowing us to sink a loop body in one pass without iteration.
///
-void LICM::SinkRegion(DomTreeNode *N) {
- assert(N != nullptr && "Null dominator tree node?");
+bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
+ DominatorTree *DT, const DataLayout *DL,
+ TargetLibraryInfo *TLI, Loop *CurLoop,
+ AliasSetTracker *CurAST, LICMSafetyInfo * SafetyInfo) {
+
+ // Verify inputs.
+ assert(N != nullptr && AA != nullptr && LI != nullptr &&
+ DT != nullptr && CurLoop != nullptr && CurAST != nullptr &&
+ SafetyInfo != nullptr && "Unexpected input to sinkRegion");
+
+ // Set changed as false.
+ bool Changed = false;
+ // Get basic block
BasicBlock *BB = N->getBlock();
-
// If this subregion is not in the top level loop at all, exit.
- if (!CurLoop->contains(BB)) return;
+ if (!CurLoop->contains(BB)) return Changed;
// We are processing blocks in reverse dfo, so process children first.
const std::vector<DomTreeNode*> &Children = N->getChildren();
for (unsigned i = 0, e = Children.size(); i != e; ++i)
- SinkRegion(Children[i]);
-
+ Changed |= sinkRegion(Children[i], AA, LI, DT, DL, TLI, CurLoop,
+ CurAST, SafetyInfo);
// Only need to process the contents of this block if it is not part of a
// subloop (which would already have been processed).
- if (inSubLoop(BB)) return;
+ if (inSubLoop(BB,CurLoop,LI)) return Changed;
for (BasicBlock::iterator II = BB->end(); II != BB->begin(); ) {
Instruction &I = *--II;
@@ -380,31 +336,39 @@ void LICM::SinkRegion(DomTreeNode *N) {
// outside of the loop. In this case, it doesn't even matter if the
// operands of the instruction are loop invariant.
//
- if (isNotUsedInLoop(I) && canSinkOrHoistInst(I)) {
+ if (isNotUsedInLoop(I, CurLoop) &&
+ canSinkOrHoistInst(I, AA, DT, DL, CurLoop, CurAST, SafetyInfo)) {
++II;
- sink(I);
+ Changed |= sink(I, LI, DT, CurLoop, CurAST);
}
}
+ return Changed;
}
-/// HoistRegion - Walk the specified region of the CFG (defined by all blocks
-/// dominated by the specified block, and that are in the current loop) in depth
-/// first order w.r.t the DominatorTree. This allows us to visit definitions
-/// before uses, allowing us to hoist a loop body in one pass without iteration.
+/// Walk the specified region of the CFG (defined by all blocks dominated by
+/// the specified block, and that are in the current loop) in depth first
+/// order w.r.t the DominatorTree. This allows us to visit definitions before
+/// uses, allowing us to hoist a loop body in one pass without iteration.
///
-void LICM::HoistRegion(DomTreeNode *N) {
- assert(N != nullptr && "Null dominator tree node?");
+bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
+ DominatorTree *DT, const DataLayout *DL,
+ TargetLibraryInfo *TLI, Loop *CurLoop,
+ AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) {
+ // Verify inputs.
+ assert(N != nullptr && AA != nullptr && LI != nullptr &&
+ DT != nullptr && CurLoop != nullptr && CurAST != nullptr &&
+ SafetyInfo != nullptr && "Unexpected input to hoistRegion");
+ // Set changed as false.
+ bool Changed = false;
+ // Get basic block
BasicBlock *BB = N->getBlock();
-
// If this subregion is not in the top level loop at all, exit.
- if (!CurLoop->contains(BB)) return;
-
+ if (!CurLoop->contains(BB)) return Changed;
// Only need to process the contents of this block if it is not part of a
// subloop (which would already have been processed).
- if (!inSubLoop(BB))
+ if (!inSubLoop(BB, CurLoop, LI))
for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ) {
Instruction &I = *II++;
-
// Try constant folding this instruction. If all the operands are
// constants, it is technically hoistable, but it would be better to just
// fold it.
@@ -421,20 +385,49 @@ void LICM::HoistRegion(DomTreeNode *N) {
// if all of the operands of the instruction are loop invariant and if it
// is safe to hoist the instruction.
//
- if (CurLoop->hasLoopInvariantOperands(&I) && canSinkOrHoistInst(I) &&
- isSafeToExecuteUnconditionally(I))
- hoist(I);
+ if (CurLoop->hasLoopInvariantOperands(&I) &&
+ canSinkOrHoistInst(I, AA, DT, DL, CurLoop, CurAST, SafetyInfo) &&
+ isSafeToExecuteUnconditionally(I, DT, DL, CurLoop, SafetyInfo))
+ Changed |= hoist(I, CurLoop->getLoopPreheader());
}
const std::vector<DomTreeNode*> &Children = N->getChildren();
for (unsigned i = 0, e = Children.size(); i != e; ++i)
- HoistRegion(Children[i]);
+ Changed |= hoistRegion(Children[i], AA, LI, DT, DL, TLI, CurLoop,
+ CurAST, SafetyInfo);
+ return Changed;
+}
+
+/// Computes loop safety information, checks loop body & header
+/// for the possiblity of may throw exception.
+///
+void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) {
+ assert(CurLoop != nullptr && "CurLoop cant be null");
+ BasicBlock *Header = CurLoop->getHeader();
+ // Setting default safety values.
+ SafetyInfo->MayThrow = false;
+ SafetyInfo->HeaderMayThrow = false;
+ // Iterate over header and compute dafety info.
+ for (BasicBlock::iterator I = Header->begin(), E = Header->end();
+ (I != E) && !SafetyInfo->HeaderMayThrow; ++I)
+ SafetyInfo->HeaderMayThrow |= I->mayThrow();
+
+ SafetyInfo->MayThrow = SafetyInfo->HeaderMayThrow;
+ // Iterate over loop instructions and compute safety info.
+ for (Loop::block_iterator BB = CurLoop->block_begin(),
+ BBE = CurLoop->block_end(); (BB != BBE) && !SafetyInfo->MayThrow ; ++BB)
+ for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end();
+ (I != E) && !SafetyInfo->MayThrow; ++I)
+ SafetyInfo->MayThrow |= I->mayThrow();
}
/// canSinkOrHoistInst - Return true if the hoister and sinker can handle this
/// instruction.
///
-bool LICM::canSinkOrHoistInst(Instruction &I) {
+bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA,
+ DominatorTree *DT, const DataLayout *DL,
+ Loop *CurLoop, AliasSetTracker *CurAST,
+ LICMSafetyInfo * SafetyInfo) {
// Loads have extra constraints we have to verify before we can hoist them.
if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
if (!LI->isUnordered())
@@ -455,7 +448,7 @@ bool LICM::canSinkOrHoistInst(Instruction &I) {
AAMDNodes AAInfo;
LI->getAAMetadata(AAInfo);
- return !pointerInvalidatedByLoop(LI->getOperand(0), Size, AAInfo);
+ return !pointerInvalidatedByLoop(LI->getOperand(0), Size, AAInfo, CurAST);
} else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
// Don't sink or hoist dbg info; it's legal, but not useful.
if (isa<DbgInfoIntrinsic>(I))
@@ -494,14 +487,14 @@ bool LICM::canSinkOrHoistInst(Instruction &I) {
!isa<InsertValueInst>(I))
return false;
- return isSafeToExecuteUnconditionally(I);
+ return isSafeToExecuteUnconditionally(I, DT, DL, CurLoop, SafetyInfo);
}
-/// \brief Returns true if a PHINode is a trivially replaceable with an
+/// Returns true if a PHINode is a trivially replaceable with an
/// Instruction.
+/// This is true when all incoming values are that instruction.
+/// This pattern occurs most often with LCSSA PHI nodes.
///
-/// This is true when all incoming values are that instruction. This pattern
-/// occurs most often with LCSSA PHI nodes.
static bool isTriviallyReplacablePHI(PHINode &PN, Instruction &I) {
for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
if (PN.getIncomingValue(i) != &I)
@@ -510,11 +503,11 @@ static bool isTriviallyReplacablePHI(PHINode &PN, Instruction &I) {
return true;
}
-/// isNotUsedInLoop - Return true if the only users of this instruction are
-/// outside of the loop. If this is true, we can sink the instruction to the
-/// exit blocks of the loop.
+/// Return true if the only users of this instruction are outside of
+/// the loop. If this is true, we can sink the instruction to the exit
+/// blocks of the loop.
///
-bool LICM::isNotUsedInLoop(Instruction &I) {
+static bool isNotUsedInLoop(Instruction &I, Loop *CurLoop) {
for (User *U : I.users()) {
Instruction *UI = cast<Instruction>(U);
if (PHINode *PN = dyn_cast<PHINode>(UI)) {
@@ -545,9 +538,9 @@ bool LICM::isNotUsedInLoop(Instruction &I) {
return true;
}
-Instruction *LICM::CloneInstructionInExitBlock(Instruction &I,
- BasicBlock &ExitBlock,
- PHINode &PN) {
+static Instruction *CloneInstructionInExitBlock(Instruction &I,
+ BasicBlock &ExitBlock,
+ PHINode &PN, LoopInfo *LI) {
Instruction *New = I.clone();
ExitBlock.getInstList().insert(ExitBlock.getFirstInsertionPt(), New);
if (!I.getName().empty()) New->setName(I.getName() + ".le");
@@ -574,14 +567,15 @@ Instruction *LICM::CloneInstructionInExitBlock(Instruction &I,
return New;
}
-/// sink - When an instruction is found to only be used outside of the loop,
-/// this function moves it to the exit blocks and patches up SSA form as needed.
+/// When an instruction is found to only be used outside of the loop, this
+/// function moves it to the exit blocks and patches up SSA form as needed.
/// This method is guaranteed to remove the original instruction from its
/// position, and may either delete it or move it to outside of the loop.
///
-void LICM::sink(Instruction &I) {
+static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
+ Loop *CurLoop, AliasSetTracker *CurAST ) {
DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
-
+ bool Changed = false;
if (isa<LoadInst>(I)) ++NumMovedLoads;
else if (isa<CallInst>(I)) ++NumMovedCalls;
++NumSunk;
@@ -590,7 +584,8 @@ void LICM::sink(Instruction &I) {
#ifndef NDEBUG
SmallVector<BasicBlock *, 32> ExitBlocks;
CurLoop->getUniqueExitBlocks(ExitBlocks);
- SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(), ExitBlocks.end());
+ SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
+ ExitBlocks.end());
#endif
// Clones of this instruction. Don't create more than one per exit block!
@@ -618,7 +613,7 @@ void LICM::sink(Instruction &I) {
New = It->second;
else
New = SunkCopies[ExitBlock] =
- CloneInstructionInExitBlock(I, *ExitBlock, *PN);
+ CloneInstructionInExitBlock(I, *ExitBlock, *PN, LI);
PN->replaceAllUsesWith(New);
PN->eraseFromParent();
@@ -626,44 +621,41 @@ void LICM::sink(Instruction &I) {
CurAST->deleteValue(&I);
I.eraseFromParent();
+ return Changed;
}
-/// hoist - When an instruction is found to only use loop invariant operands
-/// that is safe to hoist, this instruction is called to do the dirty work.
+/// When an instruction is found to only use loop invariant operands that
+/// is safe to hoist, this instruction is called to do the dirty work.
///
-void LICM::hoist(Instruction &I) {
+static bool hoist(Instruction &I, BasicBlock *Preheader) {
DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": "
<< I << "\n");
-
// Move the new node to the Preheader, before its terminator.
I.moveBefore(Preheader->getTerminator());
if (isa<LoadInst>(I)) ++NumMovedLoads;
else if (isa<CallInst>(I)) ++NumMovedCalls;
++NumHoisted;
- Changed = true;
+ return true;
}
-/// isSafeToExecuteUnconditionally - Only sink or hoist an instruction if it is
-/// not a trapping instruction or if it is a trapping instruction and is
-/// guaranteed to execute.
+/// Only sink or hoist an instruction if it is not a trapping instruction
+/// or if it is a trapping instruction and is guaranteed to execute.
///
-bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) {
+static bool isSafeToExecuteUnconditionally(Instruction &Inst, DominatorTree *DT,
+ const DataLayout *DL, Loop *CurLoop,
+ LICMSafetyInfo * SafetyInfo) {
// If it is not a trapping instruction, it is always safe to hoist.
if (isSafeToSpeculativelyExecute(&Inst, DL))
return true;
- return isGuaranteedToExecute(Inst);
+ return isGuaranteedToExecute(Inst, DT, CurLoop, SafetyInfo);
}
-bool LICM::isGuaranteedToExecute(Instruction &Inst) {
-
- // Somewhere in this loop there is an instruction which may throw and make us
- // exit the loop.
- if (MayThrow)
- return false;
+static bool isGuaranteedToExecute(Instruction &Inst, DominatorTree *DT,
+ Loop *CurLoop, LICMSafetyInfo * SafetyInfo) {
- // Otherwise we have to check to make sure that the instruction dominates all
+ // We have to check to make sure that the instruction dominates all
// of the exit blocks. If it doesn't, then there is a path out of the loop
// which does not execute this instruction, so we can't hoist it.
@@ -671,7 +663,14 @@ bool LICM::isGuaranteedToExecute(Instruction &Inst) {
// common), it is always guaranteed to dominate the exit blocks. Since this
// is a common case, and can save some work, check it now.
if (Inst.getParent() == CurLoop->getHeader())
- return true;
+ // If there's a throw in the header block, we can't guarantee we'll reach
+ // Inst.
+ return !SafetyInfo->HeaderMayThrow;
+
+ // Somewhere in this loop there is an instruction which may throw and make us
+ // exit the loop.
+ if (SafetyInfo->MayThrow)
+ return false;
// Get the exit blocks for the current loop.
SmallVector<BasicBlock*, 8> ExitBlocks;
@@ -768,25 +767,37 @@ namespace {
};
} // end anon namespace
-/// PromoteAliasSet - Try to promote memory values to scalars by sinking
-/// stores out of the loop and moving loads to before the loop. We do this by
-/// looping over the stores in the loop, looking for stores to Must pointers
-/// which are loop invariant.
+/// Try to promote memory values to scalars by sinking stores out of the
+/// loop and moving loads to before the loop. We do this by looping over
+/// the stores in the loop, looking for stores to Must pointers which are
+/// loop invariant.
///
-void LICM::PromoteAliasSet(AliasSet &AS,
- SmallVectorImpl<BasicBlock*> &ExitBlocks,
- SmallVectorImpl<Instruction*> &InsertPts,
- PredIteratorCache &PIC) {
+bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,
+ SmallVectorImpl<BasicBlock*>&ExitBlocks,
+ SmallVectorImpl<Instruction*>&InsertPts,
+ PredIteratorCache &PIC, LoopInfo *LI,
+ DominatorTree *DT, Loop *CurLoop,
+ AliasSetTracker *CurAST,
+ LICMSafetyInfo * SafetyInfo) {
+ // Verify inputs.
+ assert(LI != nullptr && DT != nullptr &&
+ CurLoop != nullptr && CurAST != nullptr &&
+ SafetyInfo != nullptr &&
+ "Unexpected Input to promoteLoopAccessesToScalars");
+ // Initially set Changed status to false.
+ bool Changed = false;
// We can promote this alias set if it has a store, if it is a "Must" alias
// set, if the pointer is loop invariant, and if we are not eliminating any
// volatile loads or stores.
if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() ||
AS.isVolatile() || !CurLoop->isLoopInvariant(AS.begin()->getValue()))
- return;
+ return Changed;
assert(!AS.empty() &&
"Must alias set should have at least one pointer element in it!");
+
Value *SomePtr = AS.begin()->getValue();
+ BasicBlock * Preheader = CurLoop->getLoopPreheader();
// It isn't safe to promote a load/store from the loop if the load/store is
// conditional. For example, turning:
@@ -810,6 +821,7 @@ void LICM::PromoteAliasSet(AliasSet &AS,
// us to prove better alignment.
unsigned Alignment = 1;
AAMDNodes AATags;
+ bool HasDedicatedExits = CurLoop->hasDedicatedExits();
// Check that all of the pointers in the alias set have the same type. We
// cannot (yet) promote a memory location that is loaded and stored in
@@ -822,7 +834,7 @@ void LICM::PromoteAliasSet(AliasSet &AS,
// cannot (yet) promote a memory location that is loaded and stored in
// different sizes.
if (SomePtr->getType() != ASIV->getType())
- return;
+ return Changed;
for (User *U : ASIV->users()) {
// Ignore instructions that are outside the loop.
@@ -835,7 +847,7 @@ void LICM::PromoteAliasSet(AliasSet &AS,
if (LoadInst *load = dyn_cast<LoadInst>(UI)) {
assert(!load->isVolatile() && "AST broken");
if (!load->isSimple())
- return;
+ return Changed;
} else if (StoreInst *store = dyn_cast<StoreInst>(UI)) {
// Stores *of* the pointer are not interesting, only stores *to* the
// pointer.
@@ -843,7 +855,14 @@ void LICM::PromoteAliasSet(AliasSet &AS,
continue;
assert(!store->isVolatile() && "AST broken");
if (!store->isSimple())
- return;
+ return Changed;
+ // Don't sink stores from loops without dedicated block exits. Exits
+ // containing indirect branches are not transformed by loop simplify,
+ // make sure we catch that. An additional load may be generated in the
+ // preheader for SSA updater, so also avoid sinking when no preheader
+ // is available.
+ if (!HasDedicatedExits || !Preheader)
+ return Changed;
// Note that we only check GuaranteedToExecute inside the store case
// so that we do not introduce stores where they did not exist before
@@ -855,16 +874,17 @@ void LICM::PromoteAliasSet(AliasSet &AS,
// Larger is better, with the exception of 0 being the best alignment.
unsigned InstAlignment = store->getAlignment();
if ((InstAlignment > Alignment || InstAlignment == 0) && Alignment != 0)
- if (isGuaranteedToExecute(*UI)) {
+ if (isGuaranteedToExecute(*UI, DT, CurLoop, SafetyInfo)) {
GuaranteedToExecute = true;
Alignment = InstAlignment;
}
if (!GuaranteedToExecute)
- GuaranteedToExecute = isGuaranteedToExecute(*UI);
+ GuaranteedToExecute = isGuaranteedToExecute(*UI, DT,
+ CurLoop, SafetyInfo);
} else
- return; // Not a load or store.
+ return Changed; // Not a load or store.
// Merge the AA tags.
if (LoopUses.empty()) {
@@ -880,7 +900,7 @@ void LICM::PromoteAliasSet(AliasSet &AS,
// If there isn't a guaranteed-to-execute instruction, we can't promote.
if (!GuaranteedToExecute)
- return;
+ return Changed;
// Otherwise, this is safe to promote, lets do it!
DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " <<*SomePtr<<'\n');
@@ -925,10 +945,12 @@ void LICM::PromoteAliasSet(AliasSet &AS,
// If the SSAUpdater didn't use the load in the preheader, just zap it now.
if (PreheaderLoad->use_empty())
PreheaderLoad->eraseFromParent();
-}
+ return Changed;
+}
-/// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info.
+/// Simple Analysis hook. Clone alias set info.
+///
void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) {
AliasSetTracker *AST = LoopToAliasSetMap.lookup(L);
if (!AST)
@@ -937,8 +959,8 @@ void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) {
AST->copyValue(From, To);
}
-/// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias
-/// set.
+/// Simple Analysis hook. Delete value V from alias set
+///
void LICM::deleteAnalysisValue(Value *V, Loop *L) {
AliasSetTracker *AST = LoopToAliasSetMap.lookup(L);
if (!AST)
@@ -948,6 +970,7 @@ void LICM::deleteAnalysisValue(Value *V, Loop *L) {
}
/// Simple Analysis hook. Delete value L from alias set map.
+///
void LICM::deleteAnalysisLoop(Loop *L) {
AliasSetTracker *AST = LoopToAliasSetMap.lookup(L);
if (!AST)
@@ -956,3 +979,23 @@ void LICM::deleteAnalysisLoop(Loop *L) {
delete AST;
LoopToAliasSetMap.erase(L);
}
+
+
+/// Return true if the body of this loop may store into the memory
+/// location pointed to by V.
+///
+static bool pointerInvalidatedByLoop(Value *V, uint64_t Size,
+ const AAMDNodes &AAInfo,
+ AliasSetTracker *CurAST) {
+ // Check to see if any of the basic blocks in CurLoop invalidate *V.
+ return CurAST->getAliasSetForPointer(V, Size, AAInfo).isMod();
+}
+
+/// Little predicate that returns true if the specified basic block is in
+/// a subloop of the current one, not the current one itself.
+///
+static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI) {
+ assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop");
+ return LI->getLoopFor(BB) != CurLoop;
+}
+
diff --git a/lib/Transforms/Scalar/LLVMBuild.txt b/lib/Transforms/Scalar/LLVMBuild.txt
index 2bb49a3..deea9e2 100644
--- a/lib/Transforms/Scalar/LLVMBuild.txt
+++ b/lib/Transforms/Scalar/LLVMBuild.txt
@@ -20,4 +20,4 @@ type = Library
name = Scalar
parent = Transforms
library_name = ScalarOpts
-required_libraries = Analysis Core InstCombine ProfileData Support Target TransformUtils
+required_libraries = Analysis Core InstCombine ProfileData Support TransformUtils
diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp
index 1d1f33a..98b068e 100644
--- a/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -39,14 +39,14 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfo>();
+ AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<ScalarEvolution>();
AU.addRequiredID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
AU.addPreserved<ScalarEvolution>();
AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<LoopInfo>();
+ AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreservedID(LoopSimplifyID);
AU.addPreservedID(LCSSAID);
}
@@ -63,7 +63,7 @@ char LoopDeletion::ID = 0;
INITIALIZE_PASS_BEGIN(LoopDeletion, "loop-deletion",
"Delete dead loops", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LCSSA)
@@ -236,7 +236,7 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) {
// Finally, the blocks from loopinfo. This has to happen late because
// otherwise our loop iterators won't work.
- LoopInfo &loopInfo = getAnalysis<LoopInfo>();
+ LoopInfo &loopInfo = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
SmallPtrSet<BasicBlock*, 8> blocks;
blocks.insert(L->block_begin(), L->block_end());
for (BasicBlock *BB : blocks)
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index a12f5a7..243c624 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -56,7 +56,7 @@
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -163,8 +163,8 @@ namespace {
/// loop preheaders be inserted into the CFG.
///
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<LoopInfo>();
- AU.addPreserved<LoopInfo>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
AU.addRequiredID(LoopSimplifyID);
AU.addPreservedID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
@@ -175,8 +175,8 @@ namespace {
AU.addPreserved<ScalarEvolution>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetLibraryInfo>();
- AU.addRequired<TargetTransformInfo>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
}
const DataLayout *getDataLayout() {
@@ -197,11 +197,16 @@ namespace {
}
TargetLibraryInfo *getTargetLibraryInfo() {
- return TLI ? TLI : (TLI = &getAnalysis<TargetLibraryInfo>());
+ if (!TLI)
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+ return TLI;
}
const TargetTransformInfo *getTargetTransformInfo() {
- return TTI ? TTI : (TTI = &getAnalysis<TargetTransformInfo>());
+ return TTI ? TTI
+ : (TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *CurLoop->getHeader()->getParent()));
}
Loop *getLoop() const { return CurLoop; }
@@ -215,14 +220,14 @@ namespace {
char LoopIdiomRecognize::ID = 0;
INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms",
false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LCSSA)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms",
false, false)
@@ -232,44 +237,13 @@ Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognize(); }
/// and zero out all the operands of this instruction. If any of them become
/// dead, delete them and the computation tree that feeds them.
///
-static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE,
+static void deleteDeadInstruction(Instruction *I,
const TargetLibraryInfo *TLI) {
- SmallVector<Instruction*, 32> NowDeadInsts;
-
- NowDeadInsts.push_back(I);
-
- // Before we touch this instruction, remove it from SE!
- do {
- Instruction *DeadInst = NowDeadInsts.pop_back_val();
-
- // This instruction is dead, zap it, in stages. Start by removing it from
- // SCEV.
- SE.forgetValue(DeadInst);
-
- for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) {
- Value *Op = DeadInst->getOperand(op);
- DeadInst->setOperand(op, nullptr);
-
- // If this operand just became dead, add it to the NowDeadInsts list.
- if (!Op->use_empty()) continue;
-
- if (Instruction *OpI = dyn_cast<Instruction>(Op))
- if (isInstructionTriviallyDead(OpI, TLI))
- NowDeadInsts.push_back(OpI);
- }
-
- DeadInst->eraseFromParent();
-
- } while (!NowDeadInsts.empty());
-}
-
-/// deleteIfDeadInstruction - If the specified value is a dead instruction,
-/// delete it and any recursively used instructions.
-static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE,
- const TargetLibraryInfo *TLI) {
- if (Instruction *I = dyn_cast<Instruction>(V))
- if (isInstructionTriviallyDead(I, TLI))
- deleteDeadInstruction(I, SE, TLI);
+ SmallVector<Value *, 16> Operands(I->value_op_begin(), I->value_op_end());
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ I->eraseFromParent();
+ for (Value *Op : Operands)
+ RecursivelyDeleteTriviallyDeadInstructions(Op, TLI);
}
//===----------------------------------------------------------------------===//
@@ -285,7 +259,7 @@ static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE,
// the concern of breaking data dependence.
bool LIRUtil::isAlmostEmpty(BasicBlock *BB) {
if (BranchInst *Br = getBranch(BB)) {
- return Br->isUnconditional() && BB->size() == 1;
+ return Br->isUnconditional() && Br == BB->begin();
}
return false;
}
@@ -542,7 +516,7 @@ void NclPopcountRecognize::transform(Instruction *CntInst,
cast<ICmpInst>(Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
PreCond->replaceAllUsesWith(NewPreCond);
- deleteDeadInstruction(PreCond, *SE, TLI);
+ RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
}
// Step 3: Note that the population count is exactly the trip count of the
@@ -592,15 +566,7 @@ void NclPopcountRecognize::transform(Instruction *CntInst,
// Step 4: All the references to the original population counter outside
// the loop are replaced with the NewCount -- the value returned from
// __builtin_ctpop().
- {
- SmallVector<Value *, 4> CntUses;
- for (User *U : CntInst->users())
- if (cast<Instruction>(U)->getParent() != Body)
- CntUses.push_back(U);
- for (unsigned Idx = 0; Idx < CntUses.size(); Idx++) {
- (cast<Instruction>(CntUses[Idx]))->replaceUsesOfWith(CntInst, NewCount);
- }
- }
+ CntInst->replaceUsesOutsideBlock(NewCount, Body);
// step 5: Forget the "non-computable" trip-count SCEV associated with the
// loop. The loop would otherwise not be deleted even if it becomes empty.
@@ -666,8 +632,8 @@ bool LoopIdiomRecognize::runOnCountableLoop() {
// set DT
(void)getDominatorTree();
- LoopInfo &LI = getAnalysis<LoopInfo>();
- TLI = &getAnalysis<TargetLibraryInfo>();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
// set TLI
(void)getTargetLibraryInfo();
@@ -997,7 +963,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
StoreSize, getAnalysis<AliasAnalysis>(), TheStore)) {
Expander.clear();
// If we generated new code for the base pointer, clean up.
- deleteIfDeadInstruction(BasePtr, *SE, TLI);
+ RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI);
return false;
}
@@ -1053,7 +1019,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
// Okay, the memset has been formed. Zap the original store and anything that
// feeds into it.
- deleteDeadInstruction(TheStore, *SE, TLI);
+ deleteDeadInstruction(TheStore, TLI);
++NumMemSet;
return true;
}
@@ -1094,7 +1060,7 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
getAnalysis<AliasAnalysis>(), SI)) {
Expander.clear();
// If we generated new code for the base pointer, clean up.
- deleteIfDeadInstruction(StoreBasePtr, *SE, TLI);
+ RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
return false;
}
@@ -1109,8 +1075,8 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
StoreSize, getAnalysis<AliasAnalysis>(), SI)) {
Expander.clear();
// If we generated new code for the base pointer, clean up.
- deleteIfDeadInstruction(LoadBasePtr, *SE, TLI);
- deleteIfDeadInstruction(StoreBasePtr, *SE, TLI);
+ RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
+ RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
return false;
}
@@ -1143,7 +1109,7 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
// Okay, the memset has been formed. Zap the original store and anything that
// feeds into it.
- deleteDeadInstruction(SI, *SE, TLI);
+ deleteDeadInstruction(SI, TLI);
++NumMemCpy;
return true;
}
diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp
index 8fd7c8f..6dc600e 100644
--- a/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -14,15 +14,16 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -42,13 +43,13 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<AssumptionTracker>();
- AU.addRequired<LoopInfo>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<LoopInfoWrapperPass>();
AU.addRequiredID(LoopSimplifyID);
AU.addPreservedID(LoopSimplifyID);
AU.addPreservedID(LCSSAID);
- AU.addPreserved("scalar-evolution");
- AU.addRequired<TargetLibraryInfo>();
+ AU.addPreserved<ScalarEvolution>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
}
};
}
@@ -56,10 +57,10 @@ namespace {
char LoopInstSimplify::ID = 0;
INITIALIZE_PASS_BEGIN(LoopInstSimplify, "loop-instsimplify",
"Simplify instructions in loops", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LCSSA)
INITIALIZE_PASS_END(LoopInstSimplify, "loop-instsimplify",
"Simplify instructions in loops", false, false)
@@ -75,11 +76,13 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
DominatorTreeWrapperPass *DTWP =
getAnalysisIfAvailable<DominatorTreeWrapperPass>();
DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
- LoopInfo *LI = &getAnalysis<LoopInfo>();
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
- const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
- AssumptionTracker *AT = &getAnalysis<AssumptionTracker>();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *L->getHeader()->getParent());
SmallVector<BasicBlock*, 8> ExitBlocks;
L->getUniqueExitBlocks(ExitBlocks);
@@ -120,7 +123,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
// Don't bother simplifying unused instructions.
if (!I->use_empty()) {
- Value *V = SimplifyInstruction(I, DL, TLI, DT, AT);
+ Value *V = SimplifyInstruction(I, DL, TLI, DT, &AC);
if (V && LI->replacementPreservesLCSSAForm(I, V)) {
// Mark all uses for resimplification next time round the loop.
for (User *U : I->users())
diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp
index 8f12204..fdf7e3b 100644
--- a/lib/Transforms/Scalar/LoopRerollPass.cpp
+++ b/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -12,7 +12,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -28,7 +30,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
@@ -43,6 +45,12 @@ static cl::opt<unsigned>
MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden,
cl::desc("The maximum increment for loop rerolling"));
+static cl::opt<unsigned>
+NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400),
+ cl::Hidden,
+ cl::desc("The maximum number of failures to tolerate"
+ " during fuzzy matching. (default: 400)"));
+
// This loop re-rolling transformation aims to transform loops like this:
//
// int foo(int a);
@@ -119,6 +127,16 @@ MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden,
// br %cmp, header, exit
namespace {
+ enum IterationLimits {
+ /// The maximum number of iterations that we'll try and reroll. This
+ /// has to be less than 25 in order to fit into a SmallBitVector.
+ IL_MaxRerollIterations = 16,
+ /// The bitvector index used by loop induction variables and other
+ /// instructions that belong to all iterations.
+ IL_All,
+ IL_End
+ };
+
class LoopReroll : public LoopPass {
public:
static char ID; // Pass ID, replacement for typeid
@@ -130,15 +148,15 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AliasAnalysis>();
- AU.addRequired<LoopInfo>();
- AU.addPreserved<LoopInfo>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<ScalarEvolution>();
- AU.addRequired<TargetLibraryInfo>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
}
-protected:
+ protected:
AliasAnalysis *AA;
LoopInfo *LI;
ScalarEvolution *SE;
@@ -311,26 +329,116 @@ protected:
DenseSet<int> Reds;
};
+ // A DAGRootSet models an induction variable being used in a rerollable
+ // loop. For example,
+ //
+ // x[i*3+0] = y1
+ // x[i*3+1] = y2
+ // x[i*3+2] = y3
+ //
+ // Base instruction -> i*3
+ // +---+----+
+ // / | \
+ // ST[y1] +1 +2 <-- Roots
+ // | |
+ // ST[y2] ST[y3]
+ //
+ // There may be multiple DAGRoots, for example:
+ //
+ // x[i*2+0] = ... (1)
+ // x[i*2+1] = ... (1)
+ // x[i*2+4] = ... (2)
+ // x[i*2+5] = ... (2)
+ // x[(i+1234)*2+5678] = ... (3)
+ // x[(i+1234)*2+5679] = ... (3)
+ //
+ // The loop will be rerolled by adding a new loop induction variable,
+ // one for the Base instruction in each DAGRootSet.
+ //
+ struct DAGRootSet {
+ Instruction *BaseInst;
+ SmallInstructionVector Roots;
+ // The instructions between IV and BaseInst (but not including BaseInst).
+ SmallInstructionSet SubsumedInsts;
+ };
+
+ // The set of all DAG roots, and state tracking of all roots
+ // for a particular induction variable.
+ struct DAGRootTracker {
+ DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV,
+ ScalarEvolution *SE, AliasAnalysis *AA,
+ TargetLibraryInfo *TLI, const DataLayout *DL)
+ : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI),
+ DL(DL), IV(IV) {
+ }
+
+ /// Stage 1: Find all the DAG roots for the induction variable.
+ bool findRoots();
+ /// Stage 2: Validate if the found roots are valid.
+ bool validate(ReductionTracker &Reductions);
+ /// Stage 3: Assuming validate() returned true, perform the
+ /// replacement.
+ /// @param IterCount The maximum iteration count of L.
+ void replace(const SCEV *IterCount);
+
+ protected:
+ typedef MapVector<Instruction*, SmallBitVector> UsesTy;
+
+ bool findRootsRecursive(Instruction *IVU,
+ SmallInstructionSet SubsumedInsts);
+ bool findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts);
+ bool collectPossibleRoots(Instruction *Base,
+ std::map<int64_t,Instruction*> &Roots);
+
+ bool collectUsedInstructions(SmallInstructionSet &PossibleRedSet);
+ void collectInLoopUserSet(const SmallInstructionVector &Roots,
+ const SmallInstructionSet &Exclude,
+ const SmallInstructionSet &Final,
+ DenseSet<Instruction *> &Users);
+ void collectInLoopUserSet(Instruction *Root,
+ const SmallInstructionSet &Exclude,
+ const SmallInstructionSet &Final,
+ DenseSet<Instruction *> &Users);
+
+ UsesTy::iterator nextInstr(int Val, UsesTy &In,
+ const SmallInstructionSet &Exclude,
+ UsesTy::iterator *StartI=nullptr);
+ bool isBaseInst(Instruction *I);
+ bool isRootInst(Instruction *I);
+ bool instrDependsOn(Instruction *I,
+ UsesTy::iterator Start,
+ UsesTy::iterator End);
+
+ LoopReroll *Parent;
+
+ // Members of Parent, replicated here for brevity.
+ Loop *L;
+ ScalarEvolution *SE;
+ AliasAnalysis *AA;
+ TargetLibraryInfo *TLI;
+ const DataLayout *DL;
+
+ // The loop induction variable.
+ Instruction *IV;
+ // Loop step amount.
+ uint64_t Inc;
+ // Loop reroll count; if Inc == 1, this records the scaling applied
+ // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ;
+ // If Inc is not 1, Scale = Inc.
+ uint64_t Scale;
+ // The roots themselves.
+ SmallVector<DAGRootSet,16> RootSets;
+ // All increment instructions for IV.
+ SmallInstructionVector LoopIncs;
+ // Map of all instructions in the loop (in order) to the iterations
+ // they are used in (or specially, IL_All for instructions
+ // used in the loop increment mechanism).
+ UsesTy Uses;
+ };
+
void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs);
void collectPossibleReductions(Loop *L,
ReductionTracker &Reductions);
- void collectInLoopUserSet(Loop *L,
- const SmallInstructionVector &Roots,
- const SmallInstructionSet &Exclude,
- const SmallInstructionSet &Final,
- DenseSet<Instruction *> &Users);
- void collectInLoopUserSet(Loop *L,
- Instruction * Root,
- const SmallInstructionSet &Exclude,
- const SmallInstructionSet &Final,
- DenseSet<Instruction *> &Users);
- bool findScaleFromMul(Instruction *RealIV, uint64_t &Scale,
- Instruction *&IV,
- SmallInstructionVector &LoopIncs);
- bool collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, Instruction *IV,
- SmallVector<SmallInstructionVector, 32> &Roots,
- SmallInstructionSet &AllRoots,
- SmallInstructionVector &LoopIncs);
bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount,
ReductionTracker &Reductions);
};
@@ -339,10 +447,10 @@ protected:
char LoopReroll::ID = 0;
INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false)
Pass *llvm::createLoopRerollPass() {
@@ -353,10 +461,10 @@ Pass *llvm::createLoopRerollPass() {
// This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in
// non-loop blocks to be outside the loop.
static bool hasUsesOutsideLoop(Instruction *I, Loop *L) {
- for (User *U : I->users())
+ for (User *U : I->users()) {
if (!L->contains(cast<Instruction>(U)))
return true;
-
+ }
return false;
}
@@ -403,6 +511,8 @@ void LoopReroll::SimpleLoopReduction::add(Loop *L) {
// (including the PHI), except for the last value (which is used by the PHI
// and also outside the loop).
Instruction *C = Instructions.front();
+ if (C->user_empty())
+ return;
do {
C = cast<Instruction>(*C->user_begin());
@@ -424,11 +534,12 @@ void LoopReroll::SimpleLoopReduction::add(Loop *L) {
return;
// C is now the (potential) last instruction in the reduction chain.
- for (User *U : C->users())
+ for (User *U : C->users()) {
// The only in-loop user can be the initial PHI.
if (L->contains(cast<Instruction>(U)))
if (cast<Instruction>(U) != Instructions.front())
return;
+ }
Instructions.push_back(C);
Valid = true;
@@ -467,7 +578,7 @@ void LoopReroll::collectPossibleReductions(Loop *L,
// if they are users, but their users are not added. This is used, for
// example, to prevent a reduction update from forcing all later reduction
// updates into the use set.
-void LoopReroll::collectInLoopUserSet(Loop *L,
+void LoopReroll::DAGRootTracker::collectInLoopUserSet(
Instruction *Root, const SmallInstructionSet &Exclude,
const SmallInstructionSet &Final,
DenseSet<Instruction *> &Users) {
@@ -504,14 +615,14 @@ void LoopReroll::collectInLoopUserSet(Loop *L,
// Collect all of the users of all of the provided root instructions (combined
// into a single set).
-void LoopReroll::collectInLoopUserSet(Loop *L,
+void LoopReroll::DAGRootTracker::collectInLoopUserSet(
const SmallInstructionVector &Roots,
const SmallInstructionSet &Exclude,
const SmallInstructionSet &Final,
DenseSet<Instruction *> &Users) {
for (SmallInstructionVector::const_iterator I = Roots.begin(),
IE = Roots.end(); I != IE; ++I)
- collectInLoopUserSet(L, *I, Exclude, Final, Users);
+ collectInLoopUserSet(*I, Exclude, Final, Users);
}
static bool isSimpleLoadStore(Instruction *I) {
@@ -524,289 +635,372 @@ static bool isSimpleLoadStore(Instruction *I) {
return false;
}
-// Recognize loops that are setup like this:
-//
-// %iv = phi [ (preheader, ...), (body, %iv.next) ]
-// %scaled.iv = mul %iv, scale
-// f(%scaled.iv)
-// %scaled.iv.1 = add %scaled.iv, 1
-// f(%scaled.iv.1)
-// %scaled.iv.2 = add %scaled.iv, 2
-// f(%scaled.iv.2)
-// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1
-// f(%scaled.iv.scale_m_1)
-// ...
-// %iv.next = add %iv, 1
-// %cmp = icmp(%iv, ...)
-// br %cmp, header, exit
-//
-// and, if found, set IV = %scaled.iv, and add %iv.next to LoopIncs.
-bool LoopReroll::findScaleFromMul(Instruction *RealIV, uint64_t &Scale,
- Instruction *&IV,
- SmallInstructionVector &LoopIncs) {
- // This is a special case: here we're looking for all uses (except for
- // the increment) to be multiplied by a common factor. The increment must
- // be by one. This is to capture loops like:
- // for (int i = 0; i < 500; ++i) {
- // foo(3*i); foo(3*i+1); foo(3*i+2);
- // }
- if (RealIV->getNumUses() != 2)
- return false;
- const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(RealIV));
- Instruction *User1 = cast<Instruction>(*RealIV->user_begin()),
- *User2 = cast<Instruction>(*std::next(RealIV->user_begin()));
- if (!SE->isSCEVable(User1->getType()) || !SE->isSCEVable(User2->getType()))
- return false;
- const SCEVAddRecExpr *User1SCEV =
- dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User1)),
- *User2SCEV =
- dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User2));
- if (!User1SCEV || !User1SCEV->isAffine() ||
- !User2SCEV || !User2SCEV->isAffine())
+/// Return true if IVU is a "simple" arithmetic operation.
+/// This is used for narrowing the search space for DAGRoots; only arithmetic
+/// and GEPs can be part of a DAGRoot.
+static bool isSimpleArithmeticOp(User *IVU) {
+ if (Instruction *I = dyn_cast<Instruction>(IVU)) {
+ switch (I->getOpcode()) {
+ default: return false;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::Shl:
+ case Instruction::AShr:
+ case Instruction::LShr:
+ case Instruction::GetElementPtr:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool isLoopIncrement(User *U, Instruction *IV) {
+ BinaryOperator *BO = dyn_cast<BinaryOperator>(U);
+ if (!BO || BO->getOpcode() != Instruction::Add)
return false;
- // We assume below that User1 is the scale multiply and User2 is the
- // increment. If this can't be true, then swap them.
- if (User1SCEV == RealIVSCEV->getPostIncExpr(*SE)) {
- std::swap(User1, User2);
- std::swap(User1SCEV, User2SCEV);
+ for (auto *UU : BO->users()) {
+ PHINode *PN = dyn_cast<PHINode>(UU);
+ if (PN && PN == IV)
+ return true;
}
+ return false;
+}
+
+bool LoopReroll::DAGRootTracker::
+collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
+ SmallInstructionVector BaseUsers;
+
+ for (auto *I : Base->users()) {
+ ConstantInt *CI = nullptr;
+
+ if (isLoopIncrement(I, IV)) {
+ LoopIncs.push_back(cast<Instruction>(I));
+ continue;
+ }
+
+ // The root nodes must be either GEPs, ORs or ADDs.
+ if (auto *BO = dyn_cast<BinaryOperator>(I)) {
+ if (BO->getOpcode() == Instruction::Add ||
+ BO->getOpcode() == Instruction::Or)
+ CI = dyn_cast<ConstantInt>(BO->getOperand(1));
+ } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ Value *LastOperand = GEP->getOperand(GEP->getNumOperands()-1);
+ CI = dyn_cast<ConstantInt>(LastOperand);
+ }
+
+ if (!CI) {
+ if (Instruction *II = dyn_cast<Instruction>(I)) {
+ BaseUsers.push_back(II);
+ continue;
+ } else {
+ DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I << "\n");
+ return false;
+ }
+ }
+
+ int64_t V = CI->getValue().getSExtValue();
+ if (Roots.find(V) != Roots.end())
+ // No duplicates, please.
+ return false;
- if (User2SCEV != RealIVSCEV->getPostIncExpr(*SE))
+ // FIXME: Add support for negative values.
+ if (V < 0) {
+ DEBUG(dbgs() << "LRR: Aborting due to negative value: " << V << "\n");
+ return false;
+ }
+
+ Roots[V] = cast<Instruction>(I);
+ }
+
+ if (Roots.empty())
return false;
- assert(User2SCEV->getStepRecurrence(*SE)->isOne() &&
- "Invalid non-unit step for multiplicative scaling");
- LoopIncs.push_back(User2);
-
- if (const SCEVConstant *MulScale =
- dyn_cast<SCEVConstant>(User1SCEV->getStepRecurrence(*SE))) {
- // Make sure that both the start and step have the same multiplier.
- if (RealIVSCEV->getStart()->getType() != MulScale->getType())
+
+ // If we found non-loop-inc, non-root users of Base, assume they are
+ // for the zeroth root index. This is because "add %a, 0" gets optimized
+ // away.
+ if (BaseUsers.size()) {
+ if (Roots.find(0) != Roots.end()) {
+ DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n");
return false;
- if (SE->getMulExpr(RealIVSCEV->getStart(), MulScale) !=
- User1SCEV->getStart())
+ }
+ Roots[0] = Base;
+ }
+
+ // Calculate the number of users of the base, or lowest indexed, iteration.
+ unsigned NumBaseUses = BaseUsers.size();
+ if (NumBaseUses == 0)
+ NumBaseUses = Roots.begin()->second->getNumUses();
+
+ // Check that every node has the same number of users.
+ for (auto &KV : Roots) {
+ if (KV.first == 0)
+ continue;
+ if (KV.second->getNumUses() != NumBaseUses) {
+ DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: "
+ << "#Base=" << NumBaseUses << ", #Root=" <<
+ KV.second->getNumUses() << "\n");
return false;
+ }
+ }
+
+ return true;
+}
- ConstantInt *MulScaleCI = MulScale->getValue();
- if (!MulScaleCI->uge(2) || MulScaleCI->uge(MaxInc))
+bool LoopReroll::DAGRootTracker::
+findRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) {
+ // Does the user look like it could be part of a root set?
+ // All its users must be simple arithmetic ops.
+ if (I->getNumUses() > IL_MaxRerollIterations)
+ return false;
+
+ if ((I->getOpcode() == Instruction::Mul ||
+ I->getOpcode() == Instruction::PHI) &&
+ I != IV &&
+ findRootsBase(I, SubsumedInsts))
+ return true;
+
+ SubsumedInsts.insert(I);
+
+ for (User *V : I->users()) {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (std::find(LoopIncs.begin(), LoopIncs.end(), I) != LoopIncs.end())
+ continue;
+
+ if (!I || !isSimpleArithmeticOp(I) ||
+ !findRootsRecursive(I, SubsumedInsts))
return false;
- Scale = MulScaleCI->getZExtValue();
- IV = User1;
- } else
+ }
+ return true;
+}
+
+bool LoopReroll::DAGRootTracker::
+findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) {
+
+ // The base instruction needs to be a multiply so
+ // that we can erase it.
+ if (IVU->getOpcode() != Instruction::Mul &&
+ IVU->getOpcode() != Instruction::PHI)
return false;
- DEBUG(dbgs() << "LRR: Found possible scaling " << *User1 << "\n");
+ std::map<int64_t, Instruction*> V;
+ if (!collectPossibleRoots(IVU, V))
+ return false;
+
+ // If we didn't get a root for index zero, then IVU must be
+ // subsumed.
+ if (V.find(0) == V.end())
+ SubsumedInsts.insert(IVU);
+
+ // Partition the vector into monotonically increasing indexes.
+ DAGRootSet DRS;
+ DRS.BaseInst = nullptr;
+
+ for (auto &KV : V) {
+ if (!DRS.BaseInst) {
+ DRS.BaseInst = KV.second;
+ DRS.SubsumedInsts = SubsumedInsts;
+ } else if (DRS.Roots.empty()) {
+ DRS.Roots.push_back(KV.second);
+ } else if (V.find(KV.first - 1) != V.end()) {
+ DRS.Roots.push_back(KV.second);
+ } else {
+ // Linear sequence terminated.
+ RootSets.push_back(DRS);
+ DRS.BaseInst = KV.second;
+ DRS.SubsumedInsts = SubsumedInsts;
+ DRS.Roots.clear();
+ }
+ }
+ RootSets.push_back(DRS);
+
return true;
}
-// Collect all root increments with respect to the provided induction variable
-// (normally the PHI, but sometimes a multiply). A root increment is an
-// instruction, normally an add, with a positive constant less than Scale. In a
-// rerollable loop, each of these increments is the root of an instruction
-// graph isomorphic to the others. Also, we collect the final induction
-// increment (the increment equal to the Scale), and its users in LoopIncs.
-bool LoopReroll::collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale,
- Instruction *IV,
- SmallVector<SmallInstructionVector, 32> &Roots,
- SmallInstructionSet &AllRoots,
- SmallInstructionVector &LoopIncs) {
- for (User *U : IV->users()) {
- Instruction *UI = cast<Instruction>(U);
- if (!SE->isSCEVable(UI->getType()))
- continue;
- if (UI->getType() != IV->getType())
- continue;
- if (!L->contains(UI))
- continue;
- if (hasUsesOutsideLoop(UI, L))
- continue;
+bool LoopReroll::DAGRootTracker::findRoots() {
- if (const SCEVConstant *Diff = dyn_cast<SCEVConstant>(SE->getMinusSCEV(
- SE->getSCEV(UI), SE->getSCEV(IV)))) {
- uint64_t Idx = Diff->getValue()->getValue().getZExtValue();
- if (Idx > 0 && Idx < Scale) {
- Roots[Idx-1].push_back(UI);
- AllRoots.insert(UI);
- } else if (Idx == Scale && Inc > 1) {
- LoopIncs.push_back(UI);
- }
+ const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV));
+ Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))->
+ getValue()->getZExtValue();
+
+ assert(RootSets.empty() && "Unclean state!");
+ if (Inc == 1) {
+ for (auto *IVU : IV->users()) {
+ if (isLoopIncrement(IVU, IV))
+ LoopIncs.push_back(cast<Instruction>(IVU));
}
+ if (!findRootsRecursive(IV, SmallInstructionSet()))
+ return false;
+ LoopIncs.push_back(IV);
+ } else {
+ if (!findRootsBase(IV, SmallInstructionSet()))
+ return false;
}
- if (Roots[0].empty())
+ // Ensure all sets have the same size.
+ if (RootSets.empty()) {
+ DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n");
return false;
- bool AllSame = true;
- for (unsigned i = 1; i < Scale-1; ++i)
- if (Roots[i].size() != Roots[0].size()) {
- AllSame = false;
- break;
+ }
+ for (auto &V : RootSets) {
+ if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) {
+ DEBUG(dbgs()
+ << "LRR: Aborting because not all root sets have the same size\n");
+ return false;
}
+ }
- if (!AllSame)
+ // And ensure all loop iterations are consecutive. We rely on std::map
+ // providing ordered traversal.
+ for (auto &V : RootSets) {
+ const auto *ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(V.BaseInst));
+ if (!ADR)
+ return false;
+
+ // Consider a DAGRootSet with N-1 roots (so N different values including
+ // BaseInst).
+ // Define d = Roots[0] - BaseInst, which should be the same as
+ // Roots[I] - Roots[I-1] for all I in [1..N).
+ // Define D = BaseInst@J - BaseInst@J-1, where "@J" means the value at the
+ // loop iteration J.
+ //
+ // Now, For the loop iterations to be consecutive:
+ // D = d * N
+
+ unsigned N = V.Roots.size() + 1;
+ const SCEV *StepSCEV = SE->getMinusSCEV(SE->getSCEV(V.Roots[0]), ADR);
+ const SCEV *ScaleSCEV = SE->getConstant(StepSCEV->getType(), N);
+ if (ADR->getStepRecurrence(*SE) != SE->getMulExpr(StepSCEV, ScaleSCEV)) {
+ DEBUG(dbgs() << "LRR: Aborting because iterations are not consecutive\n");
+ return false;
+ }
+ }
+ Scale = RootSets[0].Roots.size() + 1;
+
+ if (Scale > IL_MaxRerollIterations) {
+ DEBUG(dbgs() << "LRR: Aborting - too many iterations found. "
+ << "#Found=" << Scale << ", #Max=" << IL_MaxRerollIterations
+ << "\n");
return false;
+ }
+
+ DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale << "\n");
return true;
}
-// Validate the selected reductions. All iterations must have an isomorphic
-// part of the reduction chain and, for non-associative reductions, the chain
-// entries must appear in order.
-bool LoopReroll::ReductionTracker::validateSelected() {
- // For a non-associative reduction, the chain entries must appear in order.
- for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
- RI != RIE; ++RI) {
- int i = *RI;
- int PrevIter = 0, BaseCount = 0, Count = 0;
- for (Instruction *J : PossibleReds[i]) {
- // Note that all instructions in the chain must have been found because
- // all instructions in the function must have been assigned to some
- // iteration.
- int Iter = PossibleRedIter[J];
- if (Iter != PrevIter && Iter != PrevIter + 1 &&
- !PossibleReds[i].getReducedValue()->isAssociative()) {
- DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " <<
- J << "\n");
- return false;
- }
+bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &PossibleRedSet) {
+ // Populate the MapVector with all instructions in the block, in order first,
+ // so we can iterate over the contents later in perfect order.
+ for (auto &I : *L->getHeader()) {
+ Uses[&I].resize(IL_End);
+ }
- if (Iter != PrevIter) {
- if (Count != BaseCount) {
- DEBUG(dbgs() << "LRR: Iteration " << PrevIter <<
- " reduction use count " << Count <<
- " is not equal to the base use count " <<
- BaseCount << "\n");
- return false;
- }
+ SmallInstructionSet Exclude;
+ for (auto &DRS : RootSets) {
+ Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
+ Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
+ Exclude.insert(DRS.BaseInst);
+ }
+ Exclude.insert(LoopIncs.begin(), LoopIncs.end());
- Count = 0;
+ for (auto &DRS : RootSets) {
+ DenseSet<Instruction*> VBase;
+ collectInLoopUserSet(DRS.BaseInst, Exclude, PossibleRedSet, VBase);
+ for (auto *I : VBase) {
+ Uses[I].set(0);
+ }
+
+ unsigned Idx = 1;
+ for (auto *Root : DRS.Roots) {
+ DenseSet<Instruction*> V;
+ collectInLoopUserSet(Root, Exclude, PossibleRedSet, V);
+
+ // While we're here, check the use sets are the same size.
+ if (V.size() != VBase.size()) {
+ DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n");
+ return false;
}
- ++Count;
- if (Iter == 0)
- ++BaseCount;
+ for (auto *I : V) {
+ Uses[I].set(Idx);
+ }
+ ++Idx;
+ }
- PrevIter = Iter;
+ // Make sure our subsumed instructions are remembered too.
+ for (auto *I : DRS.SubsumedInsts) {
+ Uses[I].set(IL_All);
}
}
- return true;
-}
-
-// For all selected reductions, remove all parts except those in the first
-// iteration (and the PHI). Replace outside uses of the reduced value with uses
-// of the first-iteration reduced value (in other words, reroll the selected
-// reductions).
-void LoopReroll::ReductionTracker::replaceSelected() {
- // Fixup reductions to refer to the last instruction associated with the
- // first iteration (not the last).
- for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
- RI != RIE; ++RI) {
- int i = *RI;
- int j = 0;
- for (int e = PossibleReds[i].size(); j != e; ++j)
- if (PossibleRedIter[PossibleReds[i][j]] != 0) {
- --j;
- break;
- }
+ // Make sure the loop increments are also accounted for.
- // Replace users with the new end-of-chain value.
- SmallInstructionVector Users;
- for (User *U : PossibleReds[i].getReducedValue()->users())
- Users.push_back(cast<Instruction>(U));
+ Exclude.clear();
+ for (auto &DRS : RootSets) {
+ Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
+ Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
+ Exclude.insert(DRS.BaseInst);
+ }
- for (SmallInstructionVector::iterator J = Users.begin(),
- JE = Users.end(); J != JE; ++J)
- (*J)->replaceUsesOfWith(PossibleReds[i].getReducedValue(),
- PossibleReds[i][j]);
+ DenseSet<Instruction*> V;
+ collectInLoopUserSet(LoopIncs, Exclude, PossibleRedSet, V);
+ for (auto *I : V) {
+ Uses[I].set(IL_All);
}
-}
-// Reroll the provided loop with respect to the provided induction variable.
-// Generally, we're looking for a loop like this:
-//
-// %iv = phi [ (preheader, ...), (body, %iv.next) ]
-// f(%iv)
-// %iv.1 = add %iv, 1 <-- a root increment
-// f(%iv.1)
-// %iv.2 = add %iv, 2 <-- a root increment
-// f(%iv.2)
-// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment
-// f(%iv.scale_m_1)
-// ...
-// %iv.next = add %iv, scale
-// %cmp = icmp(%iv, ...)
-// br %cmp, header, exit
-//
-// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of
-// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can
-// be intermixed with eachother. The restriction imposed by this algorithm is
-// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1),
-// etc. be the same.
-//
-// First, we collect the use set of %iv, excluding the other increment roots.
-// This gives us f(%iv). Then we iterate over the loop instructions (scale-1)
-// times, having collected the use set of f(%iv.(i+1)), during which we:
-// - Ensure that the next unmatched instruction in f(%iv) is isomorphic to
-// the next unmatched instruction in f(%iv.(i+1)).
-// - Ensure that both matched instructions don't have any external users
-// (with the exception of last-in-chain reduction instructions).
-// - Track the (aliasing) write set, and other side effects, of all
-// instructions that belong to future iterations that come before the matched
-// instructions. If the matched instructions read from that write set, then
-// f(%iv) or f(%iv.(i+1)) has some dependency on instructions in
-// f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly,
-// if any of these future instructions had side effects (could not be
-// speculatively executed), and so do the matched instructions, when we
-// cannot reorder those side-effect-producing instructions, and rerolling
-// fails.
-//
-// Finally, we make sure that all loop instructions are either loop increment
-// roots, belong to simple latch code, parts of validated reductions, part of
-// f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions
-// have been validated), then we reroll the loop.
-bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
- const SCEV *IterCount,
- ReductionTracker &Reductions) {
- const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV));
- uint64_t Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))->
- getValue()->getZExtValue();
- // The collection of loop increment instructions.
- SmallInstructionVector LoopIncs;
- uint64_t Scale = Inc;
-
- // The effective induction variable, IV, is normally also the real induction
- // variable. When we're dealing with a loop like:
- // for (int i = 0; i < 500; ++i)
- // x[3*i] = ...;
- // x[3*i+1] = ...;
- // x[3*i+2] = ...;
- // then the real IV is still i, but the effective IV is (3*i).
- Instruction *RealIV = IV;
- if (Inc == 1 && !findScaleFromMul(RealIV, Scale, IV, LoopIncs))
- return false;
+ return true;
- assert(Scale <= MaxInc && "Scale is too large");
- assert(Scale > 1 && "Scale must be at least 2");
+}
- // The set of increment instructions for each increment value.
- SmallVector<SmallInstructionVector, 32> Roots(Scale-1);
- SmallInstructionSet AllRoots;
- if (!collectAllRoots(L, Inc, Scale, IV, Roots, AllRoots, LoopIncs))
- return false;
+/// Get the next instruction in "In" that is a member of set Val.
+/// Start searching from StartI, and do not return anything in Exclude.
+/// If StartI is not given, start from In.begin().
+LoopReroll::DAGRootTracker::UsesTy::iterator
+LoopReroll::DAGRootTracker::nextInstr(int Val, UsesTy &In,
+ const SmallInstructionSet &Exclude,
+ UsesTy::iterator *StartI) {
+ UsesTy::iterator I = StartI ? *StartI : In.begin();
+ while (I != In.end() && (I->second.test(Val) == 0 ||
+ Exclude.count(I->first) != 0))
+ ++I;
+ return I;
+}
- DEBUG(dbgs() << "LRR: Found all root induction increments for: " <<
- *RealIV << "\n");
+bool LoopReroll::DAGRootTracker::isBaseInst(Instruction *I) {
+ for (auto &DRS : RootSets) {
+ if (DRS.BaseInst == I)
+ return true;
+ }
+ return false;
+}
- // An array of just the possible reductions for this scale factor. When we
- // collect the set of all users of some root instructions, these reduction
- // instructions are treated as 'final' (their uses are not considered).
- // This is important because we don't want the root use set to search down
- // the reduction chain.
- SmallInstructionSet PossibleRedSet;
- SmallInstructionSet PossibleRedLastSet, PossibleRedPHISet;
- Reductions.restrictToScale(Scale, PossibleRedSet, PossibleRedPHISet,
- PossibleRedLastSet);
+bool LoopReroll::DAGRootTracker::isRootInst(Instruction *I) {
+ for (auto &DRS : RootSets) {
+ if (std::find(DRS.Roots.begin(), DRS.Roots.end(), I) != DRS.Roots.end())
+ return true;
+ }
+ return false;
+}
+/// Return true if instruction I depends on any instruction between
+/// Start and End.
+bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I,
+ UsesTy::iterator Start,
+ UsesTy::iterator End) {
+ for (auto *U : I->users()) {
+ for (auto It = Start; It != End; ++It)
+ if (U == It->first)
+ return true;
+ }
+ return false;
+}
+
+bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
// We now need to check for equivalence of the use graph of each root with
// that of the primary induction variable (excluding the roots). Our goal
// here is not to solve the full graph isomorphism problem, but rather to
@@ -815,121 +1009,167 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
// is the same (although we will not make an assumption about how the
// different iterations are intermixed). Note that while the order must be
// the same, the instructions may not be in the same basic block.
- SmallInstructionSet Exclude(AllRoots);
- Exclude.insert(LoopIncs.begin(), LoopIncs.end());
- DenseSet<Instruction *> BaseUseSet;
- collectInLoopUserSet(L, IV, Exclude, PossibleRedSet, BaseUseSet);
+ // An array of just the possible reductions for this scale factor. When we
+ // collect the set of all users of some root instructions, these reduction
+ // instructions are treated as 'final' (their uses are not considered).
+ // This is important because we don't want the root use set to search down
+ // the reduction chain.
+ SmallInstructionSet PossibleRedSet;
+ SmallInstructionSet PossibleRedLastSet;
+ SmallInstructionSet PossibleRedPHISet;
+ Reductions.restrictToScale(Scale, PossibleRedSet,
+ PossibleRedPHISet, PossibleRedLastSet);
- DenseSet<Instruction *> AllRootUses;
- std::vector<DenseSet<Instruction *> > RootUseSets(Scale-1);
+ // Populate "Uses" with where each instruction is used.
+ if (!collectUsedInstructions(PossibleRedSet))
+ return false;
- bool MatchFailed = false;
- for (unsigned i = 0; i < Scale-1 && !MatchFailed; ++i) {
- DenseSet<Instruction *> &RootUseSet = RootUseSets[i];
- collectInLoopUserSet(L, Roots[i], SmallInstructionSet(),
- PossibleRedSet, RootUseSet);
+ // Make sure we mark the reduction PHIs as used in all iterations.
+ for (auto *I : PossibleRedPHISet) {
+ Uses[I].set(IL_All);
+ }
- DEBUG(dbgs() << "LRR: base use set size: " << BaseUseSet.size() <<
- " vs. iteration increment " << (i+1) <<
- " use set size: " << RootUseSet.size() << "\n");
+ // Make sure all instructions in the loop are in one and only one
+ // set.
+ for (auto &KV : Uses) {
+ if (KV.second.count() != 1) {
+ DEBUG(dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: "
+ << *KV.first << " (#uses=" << KV.second.count() << ")\n");
+ return false;
+ }
+ }
- if (BaseUseSet.size() != RootUseSet.size()) {
- MatchFailed = true;
- break;
+ DEBUG(
+ for (auto &KV : Uses) {
+ dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n";
}
+ );
+ for (unsigned Iter = 1; Iter < Scale; ++Iter) {
// In addition to regular aliasing information, we need to look for
// instructions from later (future) iterations that have side effects
// preventing us from reordering them past other instructions with side
// effects.
bool FutureSideEffects = false;
AliasSetTracker AST(*AA);
-
// The map between instructions in f(%iv.(i+1)) and f(%iv).
DenseMap<Value *, Value *> BaseMap;
- assert(L->getNumBlocks() == 1 && "Cannot handle multi-block loops");
- for (BasicBlock::iterator J1 = Header->begin(), J2 = Header->begin(),
- JE = Header->end(); J1 != JE && !MatchFailed; ++J1) {
- if (cast<Instruction>(J1) == RealIV)
- continue;
- if (cast<Instruction>(J1) == IV)
- continue;
- if (!BaseUseSet.count(J1))
- continue;
- if (PossibleRedPHISet.count(J1)) // Skip reduction PHIs.
- continue;
-
- while (J2 != JE && (!RootUseSet.count(J2) ||
- std::find(Roots[i].begin(), Roots[i].end(), J2) !=
- Roots[i].end())) {
- // As we iterate through the instructions, instructions that don't
- // belong to previous iterations (or the base case), must belong to
- // future iterations. We want to track the alias set of writes from
- // previous iterations.
- if (!isa<PHINode>(J2) && !BaseUseSet.count(J2) &&
- !AllRootUses.count(J2)) {
- if (J2->mayWriteToMemory())
- AST.add(J2);
-
- // Note: This is specifically guarded by a check on isa<PHINode>,
- // which while a valid (somewhat arbitrary) micro-optimization, is
- // needed because otherwise isSafeToSpeculativelyExecute returns
- // false on PHI nodes.
- if (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2, DL))
- FutureSideEffects = true;
+ // Compare iteration Iter to the base.
+ SmallInstructionSet Visited;
+ auto BaseIt = nextInstr(0, Uses, Visited);
+ auto RootIt = nextInstr(Iter, Uses, Visited);
+ auto LastRootIt = Uses.begin();
+
+ while (BaseIt != Uses.end() && RootIt != Uses.end()) {
+ Instruction *BaseInst = BaseIt->first;
+ Instruction *RootInst = RootIt->first;
+
+ // Skip over the IV or root instructions; only match their users.
+ bool Continue = false;
+ if (isBaseInst(BaseInst)) {
+ Visited.insert(BaseInst);
+ BaseIt = nextInstr(0, Uses, Visited);
+ Continue = true;
+ }
+ if (isRootInst(RootInst)) {
+ LastRootIt = RootIt;
+ Visited.insert(RootInst);
+ RootIt = nextInstr(Iter, Uses, Visited);
+ Continue = true;
+ }
+ if (Continue) continue;
+
+ if (!BaseInst->isSameOperationAs(RootInst)) {
+ // Last chance saloon. We don't try and solve the full isomorphism
+ // problem, but try and at least catch the case where two instructions
+ // *of different types* are round the wrong way. We won't be able to
+ // efficiently tell, given two ADD instructions, which way around we
+ // should match them, but given an ADD and a SUB, we can at least infer
+ // which one is which.
+ //
+ // This should allow us to deal with a greater subset of the isomorphism
+ // problem. It does however change a linear algorithm into a quadratic
+ // one, so limit the number of probes we do.
+ auto TryIt = RootIt;
+ unsigned N = NumToleratedFailedMatches;
+ while (TryIt != Uses.end() &&
+ !BaseInst->isSameOperationAs(TryIt->first) &&
+ N--) {
+ ++TryIt;
+ TryIt = nextInstr(Iter, Uses, Visited, &TryIt);
}
- ++J2;
+ if (TryIt == Uses.end() || TryIt == RootIt ||
+ instrDependsOn(TryIt->first, RootIt, TryIt)) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+ " vs. " << *RootInst << "\n");
+ return false;
+ }
+
+ RootIt = TryIt;
+ RootInst = TryIt->first;
}
- if (!J1->isSameOperationAs(J2)) {
- DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
- " vs. " << *J2 << "\n");
- MatchFailed = true;
- break;
+ // All instructions between the last root and this root
+ // may belong to some other iteration. If they belong to a
+ // future iteration, then they're dangerous to alias with.
+ //
+ // Note that because we allow a limited amount of flexibility in the order
+ // that we visit nodes, LastRootIt might be *before* RootIt, in which
+ // case we've already checked this set of instructions so we shouldn't
+ // do anything.
+ for (; LastRootIt < RootIt; ++LastRootIt) {
+ Instruction *I = LastRootIt->first;
+ if (LastRootIt->second.find_first() < (int)Iter)
+ continue;
+ if (I->mayWriteToMemory())
+ AST.add(I);
+ // Note: This is specifically guarded by a check on isa<PHINode>,
+ // which while a valid (somewhat arbitrary) micro-optimization, is
+ // needed because otherwise isSafeToSpeculativelyExecute returns
+ // false on PHI nodes.
+ if (!isa<PHINode>(I) && !isSimpleLoadStore(I) &&
+ !isSafeToSpeculativelyExecute(I, DL))
+ // Intervening instructions cause side effects.
+ FutureSideEffects = true;
}
// Make sure that this instruction, which is in the use set of this
// root instruction, does not also belong to the base set or the set of
- // some previous root instruction.
- if (BaseUseSet.count(J2) || AllRootUses.count(J2)) {
- DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
- " vs. " << *J2 << " (prev. case overlap)\n");
- MatchFailed = true;
- break;
+ // some other root instruction.
+ if (RootIt->second.count() > 1) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+ " vs. " << *RootInst << " (prev. case overlap)\n");
+ return false;
}
// Make sure that we don't alias with any instruction in the alias set
// tracker. If we do, then we depend on a future iteration, and we
// can't reroll.
- if (J2->mayReadFromMemory()) {
- for (AliasSetTracker::iterator K = AST.begin(), KE = AST.end();
- K != KE && !MatchFailed; ++K) {
- if (K->aliasesUnknownInst(J2, *AA)) {
- DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
- " vs. " << *J2 << " (depends on future store)\n");
- MatchFailed = true;
- break;
+ if (RootInst->mayReadFromMemory())
+ for (auto &K : AST) {
+ if (K.aliasesUnknownInst(RootInst, *AA)) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+ " vs. " << *RootInst << " (depends on future store)\n");
+ return false;
}
}
- }
// If we've past an instruction from a future iteration that may have
// side effects, and this instruction might also, then we can't reorder
// them, and this matching fails. As an exception, we allow the alias
// set tracker to handle regular (simple) load/store dependencies.
if (FutureSideEffects &&
- ((!isSimpleLoadStore(J1) &&
- !isSafeToSpeculativelyExecute(J1, DL)) ||
- (!isSimpleLoadStore(J2) &&
- !isSafeToSpeculativelyExecute(J2, DL)))) {
- DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
- " vs. " << *J2 <<
+ ((!isSimpleLoadStore(BaseInst) &&
+ !isSafeToSpeculativelyExecute(BaseInst, DL)) ||
+ (!isSimpleLoadStore(RootInst) &&
+ !isSafeToSpeculativelyExecute(RootInst, DL)))) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+ " vs. " << *RootInst <<
" (side effects prevent reordering)\n");
- MatchFailed = true;
- break;
+ return false;
}
// For instructions that are part of a reduction, if the operation is
@@ -942,42 +1182,46 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
// x += a[i]; x += b[i];
// x += a[i+1]; x += b[i+1];
// x += b[i+2]; x += a[i+2];
- bool InReduction = Reductions.isPairInSame(J1, J2);
+ bool InReduction = Reductions.isPairInSame(BaseInst, RootInst);
- if (!(InReduction && J1->isAssociative())) {
+ if (!(InReduction && BaseInst->isAssociative())) {
bool Swapped = false, SomeOpMatched = false;
- for (unsigned j = 0; j < J1->getNumOperands() && !MatchFailed; ++j) {
- Value *Op2 = J2->getOperand(j);
+ for (unsigned j = 0; j < BaseInst->getNumOperands(); ++j) {
+ Value *Op2 = RootInst->getOperand(j);
// If this is part of a reduction (and the operation is not
// associatve), then we match all operands, but not those that are
// part of the reduction.
if (InReduction)
if (Instruction *Op2I = dyn_cast<Instruction>(Op2))
- if (Reductions.isPairInSame(J2, Op2I))
+ if (Reductions.isPairInSame(RootInst, Op2I))
continue;
DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2);
- if (BMI != BaseMap.end())
+ if (BMI != BaseMap.end()) {
Op2 = BMI->second;
- else if (std::find(Roots[i].begin(), Roots[i].end(),
- (Instruction*) Op2) != Roots[i].end())
- Op2 = IV;
+ } else {
+ for (auto &DRS : RootSets) {
+ if (DRS.Roots[Iter-1] == (Instruction*) Op2) {
+ Op2 = DRS.BaseInst;
+ break;
+ }
+ }
+ }
- if (J1->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
+ if (BaseInst->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
// If we've not already decided to swap the matched operands, and
// we've not already matched our first operand (note that we could
// have skipped matching the first operand because it is part of a
// reduction above), and the instruction is commutative, then try
// the swapped match.
- if (!Swapped && J1->isCommutative() && !SomeOpMatched &&
- J1->getOperand(!j) == Op2) {
+ if (!Swapped && BaseInst->isCommutative() && !SomeOpMatched &&
+ BaseInst->getOperand(!j) == Op2) {
Swapped = true;
} else {
- DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
- " vs. " << *J2 << " (operand " << j << ")\n");
- MatchFailed = true;
- break;
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
+ << " vs. " << *RootInst << " (operand " << j << ")\n");
+ return false;
}
}
@@ -985,81 +1229,41 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
}
}
- if ((!PossibleRedLastSet.count(J1) && hasUsesOutsideLoop(J1, L)) ||
- (!PossibleRedLastSet.count(J2) && hasUsesOutsideLoop(J2, L))) {
- DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
- " vs. " << *J2 << " (uses outside loop)\n");
- MatchFailed = true;
- break;
+ if ((!PossibleRedLastSet.count(BaseInst) &&
+ hasUsesOutsideLoop(BaseInst, L)) ||
+ (!PossibleRedLastSet.count(RootInst) &&
+ hasUsesOutsideLoop(RootInst, L))) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+ " vs. " << *RootInst << " (uses outside loop)\n");
+ return false;
}
- if (!MatchFailed)
- BaseMap.insert(std::pair<Value *, Value *>(J2, J1));
-
- AllRootUses.insert(J2);
- Reductions.recordPair(J1, J2, i+1);
+ Reductions.recordPair(BaseInst, RootInst, Iter);
+ BaseMap.insert(std::make_pair(RootInst, BaseInst));
- ++J2;
+ LastRootIt = RootIt;
+ Visited.insert(BaseInst);
+ Visited.insert(RootInst);
+ BaseIt = nextInstr(0, Uses, Visited);
+ RootIt = nextInstr(Iter, Uses, Visited);
}
+ assert (BaseIt == Uses.end() && RootIt == Uses.end() &&
+ "Mismatched set sizes!");
}
- if (MatchFailed)
- return false;
-
DEBUG(dbgs() << "LRR: Matched all iteration increments for " <<
- *RealIV << "\n");
-
- DenseSet<Instruction *> LoopIncUseSet;
- collectInLoopUserSet(L, LoopIncs, SmallInstructionSet(),
- SmallInstructionSet(), LoopIncUseSet);
- DEBUG(dbgs() << "LRR: Loop increment set size: " <<
- LoopIncUseSet.size() << "\n");
-
- // Make sure that all instructions in the loop have been included in some
- // use set.
- for (BasicBlock::iterator J = Header->begin(), JE = Header->end();
- J != JE; ++J) {
- if (isa<DbgInfoIntrinsic>(J))
- continue;
- if (cast<Instruction>(J) == RealIV)
- continue;
- if (cast<Instruction>(J) == IV)
- continue;
- if (BaseUseSet.count(J) || AllRootUses.count(J) ||
- (LoopIncUseSet.count(J) && (J->isTerminator() ||
- isSafeToSpeculativelyExecute(J, DL))))
- continue;
-
- if (AllRoots.count(J))
- continue;
-
- if (Reductions.isSelectedPHI(J))
- continue;
+ *IV << "\n");
- DEBUG(dbgs() << "LRR: aborting reroll based on " << *RealIV <<
- " unprocessed instruction found: " << *J << "\n");
- MatchFailed = true;
- break;
- }
-
- if (MatchFailed)
- return false;
-
- DEBUG(dbgs() << "LRR: all instructions processed from " <<
- *RealIV << "\n");
-
- if (!Reductions.validateSelected())
- return false;
-
- // At this point, we've validated the rerolling, and we're committed to
- // making changes!
-
- Reductions.replaceSelected();
+ return true;
+}
+void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) {
+ BasicBlock *Header = L->getHeader();
// Remove instructions associated with non-base iterations.
for (BasicBlock::reverse_iterator J = Header->rbegin();
J != Header->rend();) {
- if (AllRootUses.count(&*J)) {
+ unsigned I = Uses[&*J].find_first();
+ if (I > 0 && I < IL_All) {
Instruction *D = &*J;
DEBUG(dbgs() << "LRR: removing: " << *D << "\n");
D->eraseFromParent();
@@ -1069,57 +1273,198 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
++J;
}
- // Insert the new induction variable.
- const SCEV *Start = RealIVSCEV->getStart();
- if (Inc == 1)
- Start = SE->getMulExpr(Start,
- SE->getConstant(Start->getType(), Scale));
- const SCEVAddRecExpr *H =
- cast<SCEVAddRecExpr>(SE->getAddRecExpr(Start,
- SE->getConstant(RealIVSCEV->getType(), 1),
- L, SCEV::FlagAnyWrap));
- { // Limit the lifetime of SCEVExpander.
- SCEVExpander Expander(*SE, "reroll");
- Value *NewIV = Expander.expandCodeFor(H, IV->getType(), Header->begin());
-
- for (DenseSet<Instruction *>::iterator J = BaseUseSet.begin(),
- JE = BaseUseSet.end(); J != JE; ++J)
- (*J)->replaceUsesOfWith(IV, NewIV);
-
- if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) {
- if (LoopIncUseSet.count(BI)) {
- const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE);
- if (Inc == 1)
- ICSCEV =
- SE->getMulExpr(ICSCEV, SE->getConstant(ICSCEV->getType(), Scale));
- // Iteration count SCEV minus 1
- const SCEV *ICMinus1SCEV =
- SE->getMinusSCEV(ICSCEV, SE->getConstant(ICSCEV->getType(), 1));
-
- Value *ICMinus1; // Iteration count minus 1
- if (isa<SCEVConstant>(ICMinus1SCEV)) {
- ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), BI);
- } else {
- BasicBlock *Preheader = L->getLoopPreheader();
- if (!Preheader)
- Preheader = InsertPreheaderForLoop(L, this);
-
- ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(),
- Preheader->getTerminator());
- }
+ // We need to create a new induction variable for each different BaseInst.
+ for (auto &DRS : RootSets) {
+ // Insert the new induction variable.
+ const SCEVAddRecExpr *RealIVSCEV =
+ cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
+ const SCEV *Start = RealIVSCEV->getStart();
+ const SCEVAddRecExpr *H = cast<SCEVAddRecExpr>
+ (SE->getAddRecExpr(Start,
+ SE->getConstant(RealIVSCEV->getType(), 1),
+ L, SCEV::FlagAnyWrap));
+ { // Limit the lifetime of SCEVExpander.
+ SCEVExpander Expander(*SE, "reroll");
+ Value *NewIV = Expander.expandCodeFor(H, IV->getType(), Header->begin());
+
+ for (auto &KV : Uses) {
+ if (KV.second.find_first() == 0)
+ KV.first->replaceUsesOfWith(DRS.BaseInst, NewIV);
+ }
- Value *Cond =
+ if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) {
+ // FIXME: Why do we need this check?
+ if (Uses[BI].find_first() == IL_All) {
+ const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE);
+
+ // Iteration count SCEV minus 1
+ const SCEV *ICMinus1SCEV =
+ SE->getMinusSCEV(ICSCEV, SE->getConstant(ICSCEV->getType(), 1));
+
+ Value *ICMinus1; // Iteration count minus 1
+ if (isa<SCEVConstant>(ICMinus1SCEV)) {
+ ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), BI);
+ } else {
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader)
+ Preheader = InsertPreheaderForLoop(L, Parent);
+
+ ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(),
+ Preheader->getTerminator());
+ }
+
+ Value *Cond =
new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinus1, "exitcond");
- BI->setCondition(Cond);
+ BI->setCondition(Cond);
- if (BI->getSuccessor(1) != Header)
- BI->swapSuccessors();
+ if (BI->getSuccessor(1) != Header)
+ BI->swapSuccessors();
+ }
}
}
}
SimplifyInstructionsInBlock(Header, DL, TLI);
DeleteDeadPHIs(Header, TLI);
+}
+
+// Validate the selected reductions. All iterations must have an isomorphic
+// part of the reduction chain and, for non-associative reductions, the chain
+// entries must appear in order.
+bool LoopReroll::ReductionTracker::validateSelected() {
+ // For a non-associative reduction, the chain entries must appear in order.
+ for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
+ RI != RIE; ++RI) {
+ int i = *RI;
+ int PrevIter = 0, BaseCount = 0, Count = 0;
+ for (Instruction *J : PossibleReds[i]) {
+ // Note that all instructions in the chain must have been found because
+ // all instructions in the function must have been assigned to some
+ // iteration.
+ int Iter = PossibleRedIter[J];
+ if (Iter != PrevIter && Iter != PrevIter + 1 &&
+ !PossibleReds[i].getReducedValue()->isAssociative()) {
+ DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " <<
+ J << "\n");
+ return false;
+ }
+
+ if (Iter != PrevIter) {
+ if (Count != BaseCount) {
+ DEBUG(dbgs() << "LRR: Iteration " << PrevIter <<
+ " reduction use count " << Count <<
+ " is not equal to the base use count " <<
+ BaseCount << "\n");
+ return false;
+ }
+
+ Count = 0;
+ }
+
+ ++Count;
+ if (Iter == 0)
+ ++BaseCount;
+
+ PrevIter = Iter;
+ }
+ }
+
+ return true;
+}
+
+// For all selected reductions, remove all parts except those in the first
+// iteration (and the PHI). Replace outside uses of the reduced value with uses
+// of the first-iteration reduced value (in other words, reroll the selected
+// reductions).
+void LoopReroll::ReductionTracker::replaceSelected() {
+ // Fixup reductions to refer to the last instruction associated with the
+ // first iteration (not the last).
+ for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
+ RI != RIE; ++RI) {
+ int i = *RI;
+ int j = 0;
+ for (int e = PossibleReds[i].size(); j != e; ++j)
+ if (PossibleRedIter[PossibleReds[i][j]] != 0) {
+ --j;
+ break;
+ }
+
+ // Replace users with the new end-of-chain value.
+ SmallInstructionVector Users;
+ for (User *U : PossibleReds[i].getReducedValue()->users()) {
+ Users.push_back(cast<Instruction>(U));
+ }
+
+ for (SmallInstructionVector::iterator J = Users.begin(),
+ JE = Users.end(); J != JE; ++J)
+ (*J)->replaceUsesOfWith(PossibleReds[i].getReducedValue(),
+ PossibleReds[i][j]);
+ }
+}
+
+// Reroll the provided loop with respect to the provided induction variable.
+// Generally, we're looking for a loop like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// f(%iv)
+// %iv.1 = add %iv, 1 <-- a root increment
+// f(%iv.1)
+// %iv.2 = add %iv, 2 <-- a root increment
+// f(%iv.2)
+// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment
+// f(%iv.scale_m_1)
+// ...
+// %iv.next = add %iv, scale
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of
+// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can
+// be intermixed with eachother. The restriction imposed by this algorithm is
+// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1),
+// etc. be the same.
+//
+// First, we collect the use set of %iv, excluding the other increment roots.
+// This gives us f(%iv). Then we iterate over the loop instructions (scale-1)
+// times, having collected the use set of f(%iv.(i+1)), during which we:
+// - Ensure that the next unmatched instruction in f(%iv) is isomorphic to
+// the next unmatched instruction in f(%iv.(i+1)).
+// - Ensure that both matched instructions don't have any external users
+// (with the exception of last-in-chain reduction instructions).
+// - Track the (aliasing) write set, and other side effects, of all
+// instructions that belong to future iterations that come before the matched
+// instructions. If the matched instructions read from that write set, then
+// f(%iv) or f(%iv.(i+1)) has some dependency on instructions in
+// f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly,
+// if any of these future instructions had side effects (could not be
+// speculatively executed), and so do the matched instructions, when we
+// cannot reorder those side-effect-producing instructions, and rerolling
+// fails.
+//
+// Finally, we make sure that all loop instructions are either loop increment
+// roots, belong to simple latch code, parts of validated reductions, part of
+// f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions
+// have been validated), then we reroll the loop.
+bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
+ const SCEV *IterCount,
+ ReductionTracker &Reductions) {
+ DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DL);
+
+ if (!DAGRoots.findRoots())
+ return false;
+ DEBUG(dbgs() << "LRR: Found all root induction increments for: " <<
+ *IV << "\n");
+
+ if (!DAGRoots.validate(Reductions))
+ return false;
+ if (!Reductions.validateSelected())
+ return false;
+ // At this point, we've validated the rerolling, and we're committed to
+ // making changes!
+
+ Reductions.replaceSelected();
+ DAGRoots.replace(IterCount);
+
++NumRerolledLoops;
return true;
}
@@ -1129,9 +1474,9 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
return false;
AA = &getAnalysis<AliasAnalysis>();
- LI = &getAnalysis<LoopInfo>();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
SE = &getAnalysis<ScalarEvolution>();
- TLI = &getAnalysis<TargetLibraryInfo>();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
DL = DLP ? &DLP->getDataLayout() : nullptr;
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
index afd2eca..4d12349 100644
--- a/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -13,7 +13,7 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopPass.h"
@@ -54,16 +54,16 @@ namespace {
// LCSSA form makes instruction renaming easier.
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionTracker>();
+ AU.addRequired<AssumptionCacheTracker>();
AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfo>();
- AU.addPreserved<LoopInfo>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
AU.addRequiredID(LoopSimplifyID);
AU.addPreservedID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
AU.addPreservedID(LCSSAID);
AU.addPreserved<ScalarEvolution>();
- AU.addRequired<TargetTransformInfo>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
}
bool runOnLoop(Loop *L, LPPassManager &LPM) override;
@@ -74,15 +74,16 @@ namespace {
unsigned MaxHeaderSize;
LoopInfo *LI;
const TargetTransformInfo *TTI;
- AssumptionTracker *AT;
+ AssumptionCache *AC;
+ DominatorTree *DT;
};
}
char LoopRotate::ID = 0;
INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LCSSA)
INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false)
@@ -100,9 +101,13 @@ bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) {
// Save the loop metadata.
MDNode *LoopMD = L->getLoopID();
- LI = &getAnalysis<LoopInfo>();
- TTI = &getAnalysis<TargetTransformInfo>();
- AT = &getAnalysis<AssumptionTracker>();
+ Function &F = *L->getHeader()->getParent();
+
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ DT = DTWP ? &DTWP->getDomTree() : nullptr;
// Simplify the loop latch before attempting to rotate the header
// upward. Rotation may not be needed if the loop tail can be folded into the
@@ -225,20 +230,17 @@ static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr: {
- Value *IVOpnd = nullptr;
- if (isa<ConstantInt>(I->getOperand(0)))
- IVOpnd = I->getOperand(1);
-
- if (isa<ConstantInt>(I->getOperand(1))) {
- if (IVOpnd)
- return false;
-
- IVOpnd = I->getOperand(0);
- }
+ Value *IVOpnd = !isa<Constant>(I->getOperand(0))
+ ? I->getOperand(0)
+ : !isa<Constant>(I->getOperand(1))
+ ? I->getOperand(1)
+ : nullptr;
+ if (!IVOpnd)
+ return false;
// If increment operand is used outside of the loop, this speculation
// could cause extra live range interference.
- if (MultiExitLoop && IVOpnd) {
+ if (MultiExitLoop) {
for (User *UseI : IVOpnd->users()) {
auto *UserInst = cast<Instruction>(UseI);
if (!L->contains(UserInst))
@@ -307,9 +309,8 @@ bool LoopRotate::simplifyLoopLatch(Loop *L) {
// Nuke the Latch block.
assert(Latch->empty() && "unable to evacuate Latch");
LI->removeBlock(Latch);
- if (DominatorTreeWrapperPass *DTWP =
- getAnalysisIfAvailable<DominatorTreeWrapperPass>())
- DTWP->getDomTree().eraseNode(Latch);
+ if (DT)
+ DT->eraseNode(Latch);
Latch->eraseFromParent();
return true;
}
@@ -356,7 +357,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// duplicate blocks inside it.
{
SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(L, AT, EphValues);
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
CodeMetrics Metrics;
Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues);
@@ -441,7 +442,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// With the operands remapped, see if the instruction constant folds or is
// otherwise simplifyable. This commonly occurs because the entry from PHI
// nodes allows icmps and other instructions to fold.
- // FIXME: Provide DL, TLI, DT, AT to SimplifyInstruction.
+ // FIXME: Provide DL, TLI, DT, AC to SimplifyInstruction.
Value *V = SimplifyInstruction(C);
if (V && LI->replacementPreservesLCSSAForm(C, V)) {
// If so, then delete the temporary instruction and stick the folded value
@@ -494,31 +495,31 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// The conditional branch can't be folded, handle the general case.
// Update DominatorTree to reflect the CFG change we just made. Then split
// edges as necessary to preserve LoopSimplify form.
- if (DominatorTreeWrapperPass *DTWP =
- getAnalysisIfAvailable<DominatorTreeWrapperPass>()) {
- DominatorTree &DT = DTWP->getDomTree();
+ if (DT) {
// Everything that was dominated by the old loop header is now dominated
// by the original loop preheader. Conceptually the header was merged
// into the preheader, even though we reuse the actual block as a new
// loop latch.
- DomTreeNode *OrigHeaderNode = DT.getNode(OrigHeader);
+ DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
OrigHeaderNode->end());
- DomTreeNode *OrigPreheaderNode = DT.getNode(OrigPreheader);
+ DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader);
for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I)
- DT.changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode);
+ DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode);
- assert(DT.getNode(Exit)->getIDom() == OrigPreheaderNode);
- assert(DT.getNode(NewHeader)->getIDom() == OrigPreheaderNode);
+ assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode);
+ assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode);
// Update OrigHeader to be dominated by the new header block.
- DT.changeImmediateDominator(OrigHeader, OrigLatch);
+ DT->changeImmediateDominator(OrigHeader, OrigLatch);
}
// Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
// thus is not a preheader anymore.
// Split the edge to form a real preheader.
- BasicBlock *NewPH = SplitCriticalEdge(OrigPreheader, NewHeader, this);
+ BasicBlock *NewPH = SplitCriticalEdge(
+ OrigPreheader, NewHeader,
+ CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
NewPH->setName(NewHeader->getName() + ".lr.ph");
// Preserve canonical loop form, which means that 'Exit' should have only
@@ -534,8 +535,11 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
Loop *PredLoop = LI->getLoopFor(*PI);
if (!PredLoop || PredLoop->contains(Exit))
continue;
+ if (isa<IndirectBrInst>((*PI)->getTerminator()))
+ continue;
SplitLatchEdge |= L->getLoopLatch() == *PI;
- BasicBlock *ExitSplit = SplitCriticalEdge(*PI, Exit, this);
+ BasicBlock *ExitSplit = SplitCriticalEdge(
+ *PI, Exit, CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
ExitSplit->moveBefore(Exit);
}
assert(SplitLatchEdge &&
@@ -549,17 +553,15 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
PHBI->eraseFromParent();
// With our CFG finalized, update DomTree if it is available.
- if (DominatorTreeWrapperPass *DTWP =
- getAnalysisIfAvailable<DominatorTreeWrapperPass>()) {
- DominatorTree &DT = DTWP->getDomTree();
+ if (DT) {
// Update OrigHeader to be dominated by the new header block.
- DT.changeImmediateDominator(NewHeader, OrigPreheader);
- DT.changeImmediateDominator(OrigHeader, OrigLatch);
+ DT->changeImmediateDominator(NewHeader, OrigPreheader);
+ DT->changeImmediateDominator(OrigHeader, OrigLatch);
// Brute force incremental dominator tree update. Call
// findNearestCommonDominator on all CFG predecessors of each child of the
// original header.
- DomTreeNode *OrigHeaderNode = DT.getNode(OrigHeader);
+ DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
OrigHeaderNode->end());
bool Changed;
@@ -572,11 +574,11 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
pred_iterator PI = pred_begin(BB);
BasicBlock *NearestDom = *PI;
for (pred_iterator PE = pred_end(BB); PI != PE; ++PI)
- NearestDom = DT.findNearestCommonDominator(NearestDom, *PI);
+ NearestDom = DT->findNearestCommonDominator(NearestDom, *PI);
// Remember if this changes the DomTree.
if (Node->getIDom()->getBlock() != NearestDom) {
- DT.changeImmediateDominator(BB, NearestDom);
+ DT->changeImmediateDominator(BB, NearestDom);
Changed = true;
}
}
@@ -594,7 +596,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// the OrigHeader block into OrigLatch. This will succeed if they are
// connected by an unconditional branch. This is just a cleanup so the
// emitted code isn't too gross in this common case.
- MergeBlockIntoPredecessor(OrigHeader, this);
+ MergeBlockIntoPredecessor(OrigHeader, DT, LI);
DEBUG(dbgs() << "LoopRotation: into "; L->dump());
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 7b60373..318065e 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1327,11 +1327,9 @@ void LSRUse::DeleteFormula(Formula &F) {
/// RecomputeRegs - Recompute the Regs field, and update RegUses.
void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
// Now that we've filtered out some formulae, recompute the Regs set.
- SmallPtrSet<const SCEV *, 4> OldRegs = Regs;
+ SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
Regs.clear();
- for (SmallVectorImpl<Formula>::const_iterator I = Formulae.begin(),
- E = Formulae.end(); I != E; ++I) {
- const Formula &F = *I;
+ for (const Formula &F : Formulae) {
if (F.ScaledReg) Regs.insert(F.ScaledReg);
Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
}
@@ -4728,12 +4726,14 @@ void LSRInstance::RewriteForPHI(PHINode *PN,
// Split the critical edge.
BasicBlock *NewBB = nullptr;
if (!Parent->isLandingPad()) {
- NewBB = SplitCriticalEdge(BB, Parent, P,
- /*MergeIdenticalEdges=*/true,
- /*DontDeleteUselessPhis=*/true);
+ NewBB = SplitCriticalEdge(BB, Parent,
+ CriticalEdgeSplittingOptions(&DT, &LI)
+ .setMergeIdenticalEdges()
+ .setDontDeleteUselessPHIs());
} else {
SmallVector<BasicBlock*, 2> NewBBs;
- SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs);
+ SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs,
+ /*AliasAnalysis*/ nullptr, &DT, &LI);
NewBB = NewBBs[0];
}
// If NewBB==NULL, then SplitCriticalEdge refused to split because all
@@ -4863,9 +4863,10 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
LSRInstance::LSRInstance(Loop *L, Pass *P)
: IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()),
DT(P->getAnalysis<DominatorTreeWrapperPass>().getDomTree()),
- LI(P->getAnalysis<LoopInfo>()),
- TTI(P->getAnalysis<TargetTransformInfo>()), L(L), Changed(false),
- IVIncInsertPos(nullptr) {
+ LI(P->getAnalysis<LoopInfoWrapperPass>().getLoopInfo()),
+ TTI(P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *L->getHeader()->getParent())),
+ L(L), Changed(false), IVIncInsertPos(nullptr) {
// If LoopSimplify form is not available, stay out of trouble.
if (!L->isLoopSimplifyForm())
return;
@@ -5041,11 +5042,11 @@ private:
char LoopStrengthReduce::ID = 0;
INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
"Loop Strength Reduction", false, false)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(IVUsers)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
"Loop Strength Reduction", false, false)
@@ -5064,8 +5065,8 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
// many analyses if they are around.
AU.addPreservedID(LoopSimplifyID);
- AU.addRequired<LoopInfo>();
- AU.addPreserved<LoopInfo>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
AU.addRequiredID(LoopSimplifyID);
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
@@ -5076,7 +5077,7 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequiredID(LoopSimplifyID);
AU.addRequired<IVUsers>();
AU.addPreserved<IVUsers>();
- AU.addRequired<TargetTransformInfo>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
}
bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
@@ -5098,7 +5099,8 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
#endif
unsigned numFolded = Rewriter.replaceCongruentIVs(
L, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), DeadInsts,
- &getAnalysis<TargetTransformInfo>());
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *L->getHeader()->getParent()));
if (numFolded) {
Changed = true;
DeleteTriviallyDeadInstructions(DeadInsts);
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index f60d990..924be16 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -13,11 +13,12 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/FunctionTargetTransformInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DiagnosticInfo.h"
@@ -28,6 +29,8 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Analysis/InstructionSimplify.h"
#include <climits>
using namespace llvm;
@@ -38,6 +41,22 @@ static cl::opt<unsigned>
UnrollThreshold("unroll-threshold", cl::init(150), cl::Hidden,
cl::desc("The cut-off point for automatic loop unrolling"));
+static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
+ "unroll-max-iteration-count-to-analyze", cl::init(0), cl::Hidden,
+ cl::desc("Don't allow loop unrolling to simulate more than this number of"
+ "iterations when checking full unroll profitability"));
+
+static cl::opt<unsigned> UnrollMinPercentOfOptimized(
+ "unroll-percent-of-optimized-for-complete-unroll", cl::init(20), cl::Hidden,
+ cl::desc("If complete unrolling could trigger further optimizations, and, "
+ "by that, remove the given percent of instructions, perform the "
+ "complete unroll even if it's beyond the threshold"));
+
+static cl::opt<unsigned> UnrollAbsoluteThreshold(
+ "unroll-absolute-threshold", cl::init(2000), cl::Hidden,
+ cl::desc("Don't unroll if the unrolled size is bigger than this threshold,"
+ " even if we can remove big portion of instructions later."));
+
static cl::opt<unsigned>
UnrollCount("unroll-count", cl::init(0), cl::Hidden,
cl::desc("Use this unroll count for all loops including those with "
@@ -63,11 +82,16 @@ namespace {
static char ID; // Pass ID, replacement for typeid
LoopUnroll(int T = -1, int C = -1, int P = -1, int R = -1) : LoopPass(ID) {
CurrentThreshold = (T == -1) ? UnrollThreshold : unsigned(T);
+ CurrentAbsoluteThreshold = UnrollAbsoluteThreshold;
+ CurrentMinPercentOfOptimized = UnrollMinPercentOfOptimized;
CurrentCount = (C == -1) ? UnrollCount : unsigned(C);
CurrentAllowPartial = (P == -1) ? UnrollAllowPartial : (bool)P;
CurrentRuntime = (R == -1) ? UnrollRuntime : (bool)R;
UserThreshold = (T != -1) || (UnrollThreshold.getNumOccurrences() > 0);
+ UserAbsoluteThreshold = (UnrollAbsoluteThreshold.getNumOccurrences() > 0);
+ UserPercentOfOptimized =
+ (UnrollMinPercentOfOptimized.getNumOccurrences() > 0);
UserAllowPartial = (P != -1) ||
(UnrollAllowPartial.getNumOccurrences() > 0);
UserRuntime = (R != -1) || (UnrollRuntime.getNumOccurrences() > 0);
@@ -91,10 +115,16 @@ namespace {
unsigned CurrentCount;
unsigned CurrentThreshold;
+ unsigned CurrentAbsoluteThreshold;
+ unsigned CurrentMinPercentOfOptimized;
bool CurrentAllowPartial;
bool CurrentRuntime;
bool UserCount; // CurrentCount is user-specified.
bool UserThreshold; // CurrentThreshold is user-specified.
+ bool UserAbsoluteThreshold; // CurrentAbsoluteThreshold is
+ // user-specified.
+ bool UserPercentOfOptimized; // CurrentMinPercentOfOptimized is
+ // user-specified.
bool UserAllowPartial; // CurrentAllowPartial is user-specified.
bool UserRuntime; // CurrentRuntime is user-specified.
@@ -104,17 +134,16 @@ namespace {
/// loop preheaders be inserted into the CFG...
///
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionTracker>();
- AU.addRequired<LoopInfo>();
- AU.addPreserved<LoopInfo>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
AU.addRequiredID(LoopSimplifyID);
AU.addPreservedID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
AU.addPreservedID(LCSSAID);
AU.addRequired<ScalarEvolution>();
AU.addPreserved<ScalarEvolution>();
- AU.addRequired<TargetTransformInfo>();
- AU.addRequired<FunctionTargetTransformInfo>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
// FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info.
// If loop unroll does not preserve dom info then LCSSA pass on next
// loop will receive invalid dom info.
@@ -124,9 +153,11 @@ namespace {
// Fill in the UnrollingPreferences parameter with values from the
// TargetTransformationInfo.
- void getUnrollingPreferences(Loop *L, const FunctionTargetTransformInfo &FTTI,
+ void getUnrollingPreferences(Loop *L, const TargetTransformInfo &TTI,
TargetTransformInfo::UnrollingPreferences &UP) {
UP.Threshold = CurrentThreshold;
+ UP.AbsoluteThreshold = CurrentAbsoluteThreshold;
+ UP.MinPercentOfOptimized = CurrentMinPercentOfOptimized;
UP.OptSizeThreshold = OptSizeUnrollThreshold;
UP.PartialThreshold = CurrentThreshold;
UP.PartialOptSizeThreshold = OptSizeUnrollThreshold;
@@ -134,7 +165,7 @@ namespace {
UP.MaxCount = UINT_MAX;
UP.Partial = CurrentAllowPartial;
UP.Runtime = CurrentRuntime;
- FTTI.getUnrollingPreferences(L, UP);
+ TTI.getUnrollingPreferences(L, UP);
}
// Select and return an unroll count based on parameters from
@@ -153,18 +184,37 @@ namespace {
// unrolled loops respectively.
void selectThresholds(const Loop *L, bool HasPragma,
const TargetTransformInfo::UnrollingPreferences &UP,
- unsigned &Threshold, unsigned &PartialThreshold) {
+ unsigned &Threshold, unsigned &PartialThreshold,
+ unsigned NumberOfOptimizedInstructions) {
// Determine the current unrolling threshold. While this is
// normally set from UnrollThreshold, it is overridden to a
// smaller value if the current function is marked as
// optimize-for-size, and the unroll threshold was not user
// specified.
Threshold = UserThreshold ? CurrentThreshold : UP.Threshold;
+
+ // If we are allowed to completely unroll if we can remove M% of
+ // instructions, and we know that with complete unrolling we'll be able
+ // to kill N instructions, then we can afford to completely unroll loops
+ // with unrolled size up to N*100/M.
+ // Adjust the threshold according to that:
+ unsigned PercentOfOptimizedForCompleteUnroll =
+ UserPercentOfOptimized ? CurrentMinPercentOfOptimized
+ : UP.MinPercentOfOptimized;
+ unsigned AbsoluteThreshold = UserAbsoluteThreshold
+ ? CurrentAbsoluteThreshold
+ : UP.AbsoluteThreshold;
+ if (PercentOfOptimizedForCompleteUnroll)
+ Threshold = std::max<unsigned>(Threshold,
+ NumberOfOptimizedInstructions * 100 /
+ PercentOfOptimizedForCompleteUnroll);
+ // But don't allow unrolling loops bigger than absolute threshold.
+ Threshold = std::min<unsigned>(Threshold, AbsoluteThreshold);
+
PartialThreshold = UserThreshold ? CurrentThreshold : UP.PartialThreshold;
if (!UserThreshold &&
- L->getHeader()->getParent()->getAttributes().
- hasAttribute(AttributeSet::FunctionIndex,
- Attribute::OptimizeForSize)) {
+ L->getHeader()->getParent()->hasFnAttribute(
+ Attribute::OptimizeForSize)) {
Threshold = UP.OptSizeThreshold;
PartialThreshold = UP.PartialOptSizeThreshold;
}
@@ -185,10 +235,9 @@ namespace {
char LoopUnroll::ID = 0;
INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
-INITIALIZE_PASS_DEPENDENCY(FunctionTargetTransformInfo)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LCSSA)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
@@ -203,13 +252,333 @@ Pass *llvm::createSimpleLoopUnrollPass() {
return llvm::createLoopUnrollPass(-1, -1, 0, 0);
}
+static bool isLoadFromConstantInitializer(Value *V) {
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+ if (GV->isConstant() && GV->hasDefinitiveInitializer())
+ return GV->getInitializer();
+ return false;
+}
+
+struct FindConstantPointers {
+ bool LoadCanBeConstantFolded;
+ bool IndexIsConstant;
+ APInt Step;
+ APInt StartValue;
+ Value *BaseAddress;
+ const Loop *L;
+ ScalarEvolution &SE;
+ FindConstantPointers(const Loop *loop, ScalarEvolution &SE)
+ : LoadCanBeConstantFolded(true), IndexIsConstant(true), L(loop), SE(SE) {}
+
+ bool follow(const SCEV *S) {
+ if (const SCEVUnknown *SC = dyn_cast<SCEVUnknown>(S)) {
+ // We've reached the leaf node of SCEV, it's most probably just a
+ // variable. Now it's time to see if it corresponds to a global constant
+ // global (in which case we can eliminate the load), or not.
+ BaseAddress = SC->getValue();
+ LoadCanBeConstantFolded =
+ IndexIsConstant && isLoadFromConstantInitializer(BaseAddress);
+ return false;
+ }
+ if (isa<SCEVConstant>(S))
+ return true;
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ // If the current SCEV expression is AddRec, and its loop isn't the loop
+ // we are about to unroll, then we won't get a constant address after
+ // unrolling, and thus, won't be able to eliminate the load.
+ if (AR->getLoop() != L)
+ return IndexIsConstant = false;
+ // If the step isn't constant, we won't get constant addresses in unrolled
+ // version. Bail out.
+ if (const SCEVConstant *StepSE =
+ dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE)))
+ Step = StepSE->getValue()->getValue();
+ else
+ return IndexIsConstant = false;
+
+ return IndexIsConstant;
+ }
+ // If Result is true, continue traversal.
+ // Otherwise, we have found something that prevents us from (possible) load
+ // elimination.
+ return IndexIsConstant;
+ }
+ bool isDone() const { return !IndexIsConstant; }
+};
+
+// This class is used to get an estimate of the optimization effects that we
+// could get from complete loop unrolling. It comes from the fact that some
+// loads might be replaced with concrete constant values and that could trigger
+// a chain of instruction simplifications.
+//
+// E.g. we might have:
+// int a[] = {0, 1, 0};
+// v = 0;
+// for (i = 0; i < 3; i ++)
+// v += b[i]*a[i];
+// If we completely unroll the loop, we would get:
+// v = b[0]*a[0] + b[1]*a[1] + b[2]*a[2]
+// Which then will be simplified to:
+// v = b[0]* 0 + b[1]* 1 + b[2]* 0
+// And finally:
+// v = b[1]
+class UnrollAnalyzer : public InstVisitor<UnrollAnalyzer, bool> {
+ typedef InstVisitor<UnrollAnalyzer, bool> Base;
+ friend class InstVisitor<UnrollAnalyzer, bool>;
+
+ const Loop *L;
+ unsigned TripCount;
+ ScalarEvolution &SE;
+ const TargetTransformInfo &TTI;
+
+ DenseMap<Value *, Constant *> SimplifiedValues;
+ DenseMap<LoadInst *, Value *> LoadBaseAddresses;
+ SmallPtrSet<Instruction *, 32> CountedInstructions;
+
+ /// \brief Count the number of optimized instructions.
+ unsigned NumberOfOptimizedInstructions;
+
+ // Provide base case for our instruction visit.
+ bool visitInstruction(Instruction &I) { return false; };
+ // TODO: We should also visit ICmp, FCmp, GetElementPtr, Trunc, ZExt, SExt,
+ // FPTrunc, FPExt, FPToUI, FPToSI, UIToFP, SIToFP, BitCast, Select,
+ // ExtractElement, InsertElement, ShuffleVector, ExtractValue, InsertValue.
+ //
+ // Probaly it's worth to hoist the code for estimating the simplifications
+ // effects to a separate class, since we have a very similar code in
+ // InlineCost already.
+ bool visitBinaryOperator(BinaryOperator &I) {
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+ if (!isa<Constant>(LHS))
+ if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
+ LHS = SimpleLHS;
+ if (!isa<Constant>(RHS))
+ if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
+ RHS = SimpleRHS;
+ Value *SimpleV = nullptr;
+ if (auto FI = dyn_cast<FPMathOperator>(&I))
+ SimpleV =
+ SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags());
+ else
+ SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS);
+
+ if (SimpleV && CountedInstructions.insert(&I).second)
+ NumberOfOptimizedInstructions += TTI.getUserCost(&I);
+
+ if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
+ SimplifiedValues[&I] = C;
+ return true;
+ }
+ return false;
+ }
+
+ Constant *computeLoadValue(LoadInst *LI, unsigned Iteration) {
+ if (!LI)
+ return nullptr;
+ Value *BaseAddr = LoadBaseAddresses[LI];
+ if (!BaseAddr)
+ return nullptr;
+
+ auto GV = dyn_cast<GlobalVariable>(BaseAddr);
+ if (!GV)
+ return nullptr;
+
+ ConstantDataSequential *CDS =
+ dyn_cast<ConstantDataSequential>(GV->getInitializer());
+ if (!CDS)
+ return nullptr;
+
+ const SCEV *BaseAddrSE = SE.getSCEV(BaseAddr);
+ const SCEV *S = SE.getSCEV(LI->getPointerOperand());
+ const SCEV *OffSE = SE.getMinusSCEV(S, BaseAddrSE);
+
+ APInt StepC, StartC;
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OffSE);
+ if (!AR)
+ return nullptr;
+
+ if (const SCEVConstant *StepSE =
+ dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE)))
+ StepC = StepSE->getValue()->getValue();
+ else
+ return nullptr;
+
+ if (const SCEVConstant *StartSE = dyn_cast<SCEVConstant>(AR->getStart()))
+ StartC = StartSE->getValue()->getValue();
+ else
+ return nullptr;
+
+ unsigned ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U;
+ unsigned Start = StartC.getLimitedValue();
+ unsigned Step = StepC.getLimitedValue();
+
+ unsigned Index = (Start + Step * Iteration) / ElemSize;
+ if (Index >= CDS->getNumElements())
+ return nullptr;
+
+ Constant *CV = CDS->getElementAsConstant(Index);
+
+ return CV;
+ }
+
+public:
+ UnrollAnalyzer(const Loop *L, unsigned TripCount, ScalarEvolution &SE,
+ const TargetTransformInfo &TTI)
+ : L(L), TripCount(TripCount), SE(SE), TTI(TTI),
+ NumberOfOptimizedInstructions(0) {}
+
+ // Visit all loads the loop L, and for those that, after complete loop
+ // unrolling, would have a constant address and it will point to a known
+ // constant initializer, record its base address for future use. It is used
+ // when we estimate number of potentially simplified instructions.
+ void findConstFoldableLoads() {
+ for (auto BB : L->getBlocks()) {
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ if (!LI->isSimple())
+ continue;
+ Value *AddrOp = LI->getPointerOperand();
+ const SCEV *S = SE.getSCEV(AddrOp);
+ FindConstantPointers Visitor(L, SE);
+ SCEVTraversal<FindConstantPointers> T(Visitor);
+ T.visitAll(S);
+ if (Visitor.IndexIsConstant && Visitor.LoadCanBeConstantFolded) {
+ LoadBaseAddresses[LI] = Visitor.BaseAddress;
+ }
+ }
+ }
+ }
+ }
+
+ // Given a list of loads that could be constant-folded (LoadBaseAddresses),
+ // estimate number of optimized instructions after substituting the concrete
+ // values for the given Iteration. Also track how many instructions become
+ // dead through this process.
+ unsigned estimateNumberOfOptimizedInstructions(unsigned Iteration) {
+ // We keep a set vector for the worklist so that we don't wast space in the
+ // worklist queuing up the same instruction repeatedly. This can happen due
+ // to multiple operands being the same instruction or due to the same
+ // instruction being an operand of lots of things that end up dead or
+ // simplified.
+ SmallSetVector<Instruction *, 8> Worklist;
+
+ // Clear the simplified values and counts for this iteration.
+ SimplifiedValues.clear();
+ CountedInstructions.clear();
+ NumberOfOptimizedInstructions = 0;
+
+ // We start by adding all loads to the worklist.
+ for (auto &LoadDescr : LoadBaseAddresses) {
+ LoadInst *LI = LoadDescr.first;
+ SimplifiedValues[LI] = computeLoadValue(LI, Iteration);
+ if (CountedInstructions.insert(LI).second)
+ NumberOfOptimizedInstructions += TTI.getUserCost(LI);
+
+ for (User *U : LI->users())
+ Worklist.insert(cast<Instruction>(U));
+ }
+
+ // And then we try to simplify every user of every instruction from the
+ // worklist. If we do simplify a user, add it to the worklist to process
+ // its users as well.
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
+ if (!L->contains(I))
+ continue;
+ if (!visit(I))
+ continue;
+ for (User *U : I->users())
+ Worklist.insert(cast<Instruction>(U));
+ }
+
+ // Now that we know the potentially simplifed instructions, estimate number
+ // of instructions that would become dead if we do perform the
+ // simplification.
+
+ // The dead instructions are held in a separate set. This is used to
+ // prevent us from re-examining instructions and make sure we only count
+ // the benifit once. The worklist's internal set handles insertion
+ // deduplication.
+ SmallPtrSet<Instruction *, 16> DeadInstructions;
+
+ // Lambda to enque operands onto the worklist.
+ auto EnqueueOperands = [&](Instruction &I) {
+ for (auto *Op : I.operand_values())
+ if (auto *OpI = dyn_cast<Instruction>(Op))
+ if (!OpI->use_empty())
+ Worklist.insert(OpI);
+ };
+
+ // Start by initializing worklist with simplified instructions.
+ for (auto &FoldedKeyValue : SimplifiedValues)
+ if (auto *FoldedInst = dyn_cast<Instruction>(FoldedKeyValue.first)) {
+ DeadInstructions.insert(FoldedInst);
+
+ // Add each instruction operand of this dead instruction to the
+ // worklist.
+ EnqueueOperands(*FoldedInst);
+ }
+
+ // If a definition of an insn is only used by simplified or dead
+ // instructions, it's also dead. Check defs of all instructions from the
+ // worklist.
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
+ if (!L->contains(I))
+ continue;
+ if (DeadInstructions.count(I))
+ continue;
+
+ if (std::all_of(I->user_begin(), I->user_end(), [&](User *U) {
+ return DeadInstructions.count(cast<Instruction>(U));
+ })) {
+ NumberOfOptimizedInstructions += TTI.getUserCost(I);
+ DeadInstructions.insert(I);
+ EnqueueOperands(*I);
+ }
+ }
+ return NumberOfOptimizedInstructions;
+ }
+};
+
+// Complete loop unrolling can make some loads constant, and we need to know if
+// that would expose any further optimization opportunities.
+// This routine estimates this optimization effect and returns the number of
+// instructions, that potentially might be optimized away.
+static unsigned
+approximateNumberOfOptimizedInstructions(const Loop *L, ScalarEvolution &SE,
+ unsigned TripCount,
+ const TargetTransformInfo &TTI) {
+ if (!TripCount || !UnrollMaxIterationsCountToAnalyze)
+ return 0;
+
+ UnrollAnalyzer UA(L, TripCount, SE, TTI);
+ UA.findConstFoldableLoads();
+
+ // Estimate number of instructions, that could be simplified if we replace a
+ // load with the corresponding constant. Since the same load will take
+ // different values on different iterations, we have to go through all loop's
+ // iterations here. To limit ourselves here, we check only first N
+ // iterations, and then scale the found number, if necessary.
+ unsigned IterationsNumberForEstimate =
+ std::min<unsigned>(UnrollMaxIterationsCountToAnalyze, TripCount);
+ unsigned NumberOfOptimizedInstructions = 0;
+ for (unsigned i = 0; i < IterationsNumberForEstimate; ++i)
+ NumberOfOptimizedInstructions +=
+ UA.estimateNumberOfOptimizedInstructions(i);
+
+ NumberOfOptimizedInstructions *= TripCount / IterationsNumberForEstimate;
+
+ return NumberOfOptimizedInstructions;
+}
+
/// ApproximateLoopSize - Approximate the size of the loop.
static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
bool &NotDuplicatable,
const TargetTransformInfo &TTI,
- AssumptionTracker *AT) {
+ AssumptionCache *AC) {
SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(L, AT, EphValues);
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
CodeMetrics Metrics;
for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
@@ -222,8 +591,11 @@ static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
// Don't allow an estimate of size zero. This would allows unrolling of loops
// with huge iteration counts, which is a compile time problem even if it's
- // not a problem for code quality.
- if (LoopSize == 0) LoopSize = 1;
+ // not a problem for code quality. Also, the code using this size may assume
+ // that each loop has at least three instructions (likely a conditional
+ // branch, a comparison feeding that branch, and some kind of loop increment
+ // feeding that comparison instruction).
+ LoopSize = std::max(LoopSize, 3u);
return LoopSize;
}
@@ -231,48 +603,31 @@ static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
// Returns the loop hint metadata node with the given name (for example,
// "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
// returned.
-static const MDNode *GetUnrollMetadata(const Loop *L, StringRef Name) {
- MDNode *LoopID = L->getLoopID();
- if (!LoopID)
- return nullptr;
-
- // First operand should refer to the loop id itself.
- assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
- assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
-
- for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
- const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
- if (!MD)
- continue;
-
- const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
- if (!S)
- continue;
-
- if (Name.equals(S->getString()))
- return MD;
- }
+static MDNode *GetUnrollMetadataForLoop(const Loop *L, StringRef Name) {
+ if (MDNode *LoopID = L->getLoopID())
+ return GetUnrollMetadata(LoopID, Name);
return nullptr;
}
// Returns true if the loop has an unroll(full) pragma.
static bool HasUnrollFullPragma(const Loop *L) {
- return GetUnrollMetadata(L, "llvm.loop.unroll.full");
+ return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
}
// Returns true if the loop has an unroll(disable) pragma.
static bool HasUnrollDisablePragma(const Loop *L) {
- return GetUnrollMetadata(L, "llvm.loop.unroll.disable");
+ return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.disable");
}
// If loop has an unroll_count pragma return the (necessarily
// positive) value from the pragma. Otherwise return 0.
static unsigned UnrollCountPragmaValue(const Loop *L) {
- const MDNode *MD = GetUnrollMetadata(L, "llvm.loop.unroll.count");
+ MDNode *MD = GetUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
if (MD) {
assert(MD->getNumOperands() == 2 &&
"Unroll count hint metadata should have two operands.");
- unsigned Count = cast<ConstantInt>(MD->getOperand(1))->getZExtValue();
+ unsigned Count =
+ mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
assert(Count >= 1 && "Unroll count must be positive.");
return Count;
}
@@ -288,9 +643,9 @@ static void SetLoopAlreadyUnrolled(Loop *L) {
if (!LoopID) return;
// First remove any existing loop unrolling metadata.
- SmallVector<Value *, 4> Vals;
+ SmallVector<Metadata *, 4> MDs;
// Reserve first location for self reference to the LoopID metadata node.
- Vals.push_back(nullptr);
+ MDs.push_back(nullptr);
for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
bool IsUnrollMetadata = false;
MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
@@ -298,17 +653,18 @@ static void SetLoopAlreadyUnrolled(Loop *L) {
const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll.");
}
- if (!IsUnrollMetadata) Vals.push_back(LoopID->getOperand(i));
+ if (!IsUnrollMetadata)
+ MDs.push_back(LoopID->getOperand(i));
}
// Add unroll(disable) metadata to disable future unrolling.
LLVMContext &Context = L->getHeader()->getContext();
- SmallVector<Value *, 1> DisableOperands;
+ SmallVector<Metadata *, 1> DisableOperands;
DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable"));
MDNode *DisableNode = MDNode::get(Context, DisableOperands);
- Vals.push_back(DisableNode);
+ MDs.push_back(DisableNode);
- MDNode *NewLoopID = MDNode::get(Context, Vals);
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
// Set operand 0 to refer to the loop id itself.
NewLoopID->replaceOperandWith(0, NewLoopID);
L->setLoopID(NewLoopID);
@@ -358,12 +714,13 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
if (skipOptnoneFunction(L))
return false;
- LoopInfo *LI = &getAnalysis<LoopInfo>();
+ Function &F = *L->getHeader()->getParent();
+
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
ScalarEvolution *SE = &getAnalysis<ScalarEvolution>();
- const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
- const FunctionTargetTransformInfo &FTTI =
- getAnalysis<FunctionTargetTransformInfo>();
- AssumptionTracker *AT = &getAnalysis<AssumptionTracker>();
+ const TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
BasicBlock *Header = L->getHeader();
DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName()
@@ -377,7 +734,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
bool HasPragma = PragmaFullUnroll || PragmaCount > 0;
TargetTransformInfo::UnrollingPreferences UP;
- getUnrollingPreferences(L, FTTI, UP);
+ getUnrollingPreferences(L, TTI, UP);
// Find trip count and trip multiple if count is not available
unsigned TripCount = 0;
@@ -402,9 +759,13 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
unsigned NumInlineCandidates;
bool notDuplicatable;
unsigned LoopSize =
- ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI, AT);
+ ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI, &AC);
DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
- uint64_t UnrolledSize = (uint64_t)LoopSize * Count;
+
+ // When computing the unrolled size, note that the conditional branch on the
+ // backedge and the comparison feeding it are not replicated like the rest of
+ // the loop body (which is why 2 is subtracted).
+ uint64_t UnrolledSize = (uint64_t)(LoopSize-2) * Count + 2;
if (notDuplicatable) {
DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
<< " instructions.\n");
@@ -415,8 +776,14 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
return false;
}
+ unsigned NumberOfOptimizedInstructions =
+ approximateNumberOfOptimizedInstructions(L, *SE, TripCount, TTI);
+ DEBUG(dbgs() << " Complete unrolling could save: "
+ << NumberOfOptimizedInstructions << "\n");
+
unsigned Threshold, PartialThreshold;
- selectThresholds(L, HasPragma, UP, Threshold, PartialThreshold);
+ selectThresholds(L, HasPragma, UP, Threshold, PartialThreshold,
+ NumberOfOptimizedInstructions);
// Given Count, TripCount and thresholds determine the type of
// unrolling which is to be performed.
@@ -449,7 +816,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
}
if (PartialThreshold != NoThreshold && UnrolledSize > PartialThreshold) {
// Reduce unroll count to be modulo of TripCount for partial unrolling.
- Count = PartialThreshold / LoopSize;
+ Count = (std::max(PartialThreshold, 3u)-2) / (LoopSize-2);
while (Count != 0 && TripCount % Count != 0)
Count--;
}
@@ -463,7 +830,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
// the original count which satisfies the threshold limit.
while (Count != 0 && UnrolledSize > PartialThreshold) {
Count >>= 1;
- UnrolledSize = LoopSize * Count;
+ UnrolledSize = (LoopSize-2) * Count + 2;
}
if (Count > UP.MaxCount)
Count = UP.MaxCount;
@@ -509,7 +876,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
// Unroll the loop.
if (!UnrollLoop(L, Count, TripCount, AllowRuntime, TripMultiple, LI, this,
- &LPM, AT))
+ &LPM, &AC))
return false;
return true;
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index ef43483..987dc96 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -30,7 +30,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -105,7 +105,7 @@ namespace {
// Analyze loop. Check its size, calculate is it possible to unswitch
// it. Returns true if we can unswitch this loop.
bool countLoop(const Loop *L, const TargetTransformInfo &TTI,
- AssumptionTracker *AT);
+ AssumptionCache *AC);
// Clean all data related to given loop.
void forgetLoop(const Loop *L);
@@ -128,7 +128,7 @@ namespace {
class LoopUnswitch : public LoopPass {
LoopInfo *LI; // Loop information
LPPassManager *LPM;
- AssumptionTracker *AT;
+ AssumptionCache *AC;
// LoopProcessWorklist - Used to check if second loop needs processing
// after RewriteLoopBodyWithConditionConstant rewrites first loop.
@@ -167,16 +167,16 @@ namespace {
/// loop preheaders be inserted into the CFG.
///
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionTracker>();
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequiredID(LoopSimplifyID);
AU.addPreservedID(LoopSimplifyID);
- AU.addRequired<LoopInfo>();
- AU.addPreserved<LoopInfo>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
AU.addRequiredID(LCSSAID);
AU.addPreservedID(LCSSAID);
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<ScalarEvolution>();
- AU.addRequired<TargetTransformInfo>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
}
private:
@@ -217,7 +217,7 @@ namespace {
// Analyze loop. Check its size, calculate is it possible to unswitch
// it. Returns true if we can unswitch this loop.
bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI,
- AssumptionTracker *AT) {
+ AssumptionCache *AC) {
LoopPropsMapIt PropsIt;
bool Inserted;
@@ -235,7 +235,7 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI,
// This is a very ad-hoc heuristic.
SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(L, AT, EphValues);
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
// FIXME: This is overly conservative because it does not take into
// consideration code simplification opportunities and code that can
@@ -333,10 +333,10 @@ void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop,
char LoopUnswitch::ID = 0;
INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops",
false, false)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LCSSA)
INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops",
false, false)
@@ -385,8 +385,9 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
if (skipOptnoneFunction(L))
return false;
- AT = &getAnalysis<AssumptionTracker>();
- LI = &getAnalysis<LoopInfo>();
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *L->getHeader()->getParent());
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
LPM = &LPM_Ref;
DominatorTreeWrapperPass *DTWP =
getAnalysisIfAvailable<DominatorTreeWrapperPass>();
@@ -431,8 +432,10 @@ bool LoopUnswitch::processCurrentLoop() {
// Probably we reach the quota of branches for this loop. If so
// stop unswitching.
- if (!BranchesInfo.countLoop(currentLoop, getAnalysis<TargetTransformInfo>(),
- AT))
+ if (!BranchesInfo.countLoop(
+ currentLoop, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *currentLoop->getHeader()->getParent()),
+ AC))
return false;
// Loop over all of the basic blocks in the loop. If we find an interior
@@ -654,9 +657,7 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) {
// Check to see if it would be profitable to unswitch current loop.
// Do not do non-trivial unswitch while optimizing for size.
- if (OptimizeForSize ||
- F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
- Attribute::OptimizeForSize))
+ if (OptimizeForSize || F->hasFnAttribute(Attribute::OptimizeForSize))
return false;
UnswitchNontrivialCondition(LoopCond, Val, currentLoop);
@@ -674,7 +675,7 @@ static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
I != E; ++I)
if (LI->getLoopFor(*I) == L)
- New->addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), LI->getBase());
+ New->addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI);
// Add all of the subloops to the new loop.
for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
@@ -705,8 +706,9 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
// If either edge is critical, split it. This helps preserve LoopSimplify
// form for enclosing loops.
- SplitCriticalEdge(BI, 0, this, false, false, true);
- SplitCriticalEdge(BI, 1, this, false, false, true);
+ auto Options = CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA();
+ SplitCriticalEdge(BI, 0, Options);
+ SplitCriticalEdge(BI, 1, Options);
}
/// UnswitchTrivialCondition - Given a loop that has a trivial unswitchable
@@ -725,7 +727,7 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond,
// First step, split the preheader, so that we know that there is a safe place
// to insert the conditional branch. We will change loopPreheader to have a
// conditional branch on Cond.
- BasicBlock *NewPH = SplitEdge(loopPreheader, loopHeader, this);
+ BasicBlock *NewPH = SplitEdge(loopPreheader, loopHeader, DT, LI);
// Now that we have a place to insert the conditional branch, create a place
// to branch to: this is the exit block out of the loop that we should
@@ -736,7 +738,7 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond,
// without actually branching to it (the exit block should be dominated by the
// loop header, not the preheader).
assert(!L->contains(ExitBlock) && "Exit block is in the loop?");
- BasicBlock *NewExit = SplitBlock(ExitBlock, ExitBlock->begin(), this);
+ BasicBlock *NewExit = SplitBlock(ExitBlock, ExitBlock->begin(), DT, LI);
// Okay, now we have a position to branch from and a position to branch to,
// insert the new conditional branch.
@@ -767,13 +769,9 @@ void LoopUnswitch::SplitExitEdges(Loop *L,
// Although SplitBlockPredecessors doesn't preserve loop-simplify in
// general, if we call it on all predecessors of all exits then it does.
- if (!ExitBlock->isLandingPad()) {
- SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", this);
- } else {
- SmallVector<BasicBlock*, 2> NewBBs;
- SplitLandingPadPredecessors(ExitBlock, Preds, ".us-lcssa", ".us-lcssa",
- this, NewBBs);
- }
+ SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa",
+ /*AliasAnalysis*/ nullptr, DT, LI,
+ /*PreserveLCSSA*/ true);
}
}
@@ -796,7 +794,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
// First step, split the preheader and exit blocks, and add these blocks to
// the LoopBlocks list.
- BasicBlock *NewPreheader = SplitEdge(loopPreheader, loopHeader, this);
+ BasicBlock *NewPreheader = SplitEdge(loopPreheader, loopHeader, DT, LI);
LoopBlocks.push_back(NewPreheader);
// We want the loop to come after the preheader, but before the exit blocks.
@@ -836,7 +834,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
// FIXME: We could register any cloned assumptions instead of clearing the
// whole function's cache.
- AT->forgetCachedAssumptions(F);
+ AC->clear();
// Now we create the new Loop object for the versioned loop.
Loop *NewLoop = CloneLoop(L, L->getParentLoop(), VMap, LI, LPM);
@@ -849,14 +847,14 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
if (ParentLoop) {
// Make sure to add the cloned preheader and exit blocks to the parent loop
// as well.
- ParentLoop->addBasicBlockToLoop(NewBlocks[0], LI->getBase());
+ ParentLoop->addBasicBlockToLoop(NewBlocks[0], *LI);
}
for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[i]]);
// The new exit block should be in the same loop as the old one.
if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[i]))
- ExitBBLoop->addBasicBlockToLoop(NewExit, LI->getBase());
+ ExitBBLoop->addBasicBlockToLoop(NewExit, *LI);
assert(NewExit->getTerminator()->getNumSuccessors() == 1 &&
"Exit block should have been split to have one successor!");
@@ -1042,7 +1040,7 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
// and hooked up so as to preserve the loop structure, because
// trying to update it is complicated. So instead we preserve the
// loop structure and put the block on a dead code path.
- SplitEdge(Switch, SISucc, this);
+ SplitEdge(Switch, SISucc, DT, LI);
// Compute the successors instead of relying on the return value
// of SplitEdge, since it may have split the switch successor
// after PHI nodes.
diff --git a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
new file mode 100644
index 0000000..0c47cbd
--- /dev/null
+++ b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -0,0 +1,192 @@
+//===- LowerExpectIntrinsic.cpp - Lower expect intrinsic ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the 'expect' intrinsic to LLVM metadata.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lower-expect-intrinsic"
+
+STATISTIC(ExpectIntrinsicsHandled,
+ "Number of 'expect' intrinsic instructions handled");
+
+static cl::opt<uint32_t>
+LikelyBranchWeight("likely-branch-weight", cl::Hidden, cl::init(64),
+ cl::desc("Weight of the branch likely to be taken (default = 64)"));
+static cl::opt<uint32_t>
+UnlikelyBranchWeight("unlikely-branch-weight", cl::Hidden, cl::init(4),
+ cl::desc("Weight of the branch unlikely to be taken (default = 4)"));
+
+static bool handleSwitchExpect(SwitchInst &SI) {
+ CallInst *CI = dyn_cast<CallInst>(SI.getCondition());
+ if (!CI)
+ return false;
+
+ Function *Fn = CI->getCalledFunction();
+ if (!Fn || Fn->getIntrinsicID() != Intrinsic::expect)
+ return false;
+
+ Value *ArgValue = CI->getArgOperand(0);
+ ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ if (!ExpectedValue)
+ return false;
+
+ SwitchInst::CaseIt Case = SI.findCaseValue(ExpectedValue);
+ unsigned n = SI.getNumCases(); // +1 for default case.
+ SmallVector<uint32_t, 16> Weights(n + 1, UnlikelyBranchWeight);
+
+ if (Case == SI.case_default())
+ Weights[0] = LikelyBranchWeight;
+ else
+ Weights[Case.getCaseIndex() + 1] = LikelyBranchWeight;
+
+ SI.setMetadata(LLVMContext::MD_prof,
+ MDBuilder(CI->getContext()).createBranchWeights(Weights));
+
+ SI.setCondition(ArgValue);
+ return true;
+}
+
+static bool handleBranchExpect(BranchInst &BI) {
+ if (BI.isUnconditional())
+ return false;
+
+ // Handle non-optimized IR code like:
+ // %expval = call i64 @llvm.expect.i64(i64 %conv1, i64 1)
+ // %tobool = icmp ne i64 %expval, 0
+ // br i1 %tobool, label %if.then, label %if.end
+ //
+ // Or the following simpler case:
+ // %expval = call i1 @llvm.expect.i1(i1 %cmp, i1 1)
+ // br i1 %expval, label %if.then, label %if.end
+
+ CallInst *CI;
+
+ ICmpInst *CmpI = dyn_cast<ICmpInst>(BI.getCondition());
+ if (!CmpI) {
+ CI = dyn_cast<CallInst>(BI.getCondition());
+ } else {
+ if (CmpI->getPredicate() != CmpInst::ICMP_NE)
+ return false;
+ CI = dyn_cast<CallInst>(CmpI->getOperand(0));
+ }
+
+ if (!CI)
+ return false;
+
+ Function *Fn = CI->getCalledFunction();
+ if (!Fn || Fn->getIntrinsicID() != Intrinsic::expect)
+ return false;
+
+ Value *ArgValue = CI->getArgOperand(0);
+ ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ if (!ExpectedValue)
+ return false;
+
+ MDBuilder MDB(CI->getContext());
+ MDNode *Node;
+
+ // If expect value is equal to 1 it means that we are more likely to take
+ // branch 0, in other case more likely is branch 1.
+ if (ExpectedValue->isOne())
+ Node = MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight);
+ else
+ Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight);
+
+ BI.setMetadata(LLVMContext::MD_prof, Node);
+
+ if (CmpI)
+ CmpI->setOperand(0, ArgValue);
+ else
+ BI.setCondition(ArgValue);
+ return true;
+}
+
+static bool lowerExpectIntrinsic(Function &F) {
+ bool Changed = false;
+
+ for (BasicBlock &BB : F) {
+ // Create "block_weights" metadata.
+ if (BranchInst *BI = dyn_cast<BranchInst>(BB.getTerminator())) {
+ if (handleBranchExpect(*BI))
+ ExpectIntrinsicsHandled++;
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB.getTerminator())) {
+ if (handleSwitchExpect(*SI))
+ ExpectIntrinsicsHandled++;
+ }
+
+ // remove llvm.expect intrinsics.
+ for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
+ CallInst *CI = dyn_cast<CallInst>(BI++);
+ if (!CI)
+ continue;
+
+ Function *Fn = CI->getCalledFunction();
+ if (Fn && Fn->getIntrinsicID() == Intrinsic::expect) {
+ Value *Exp = CI->getArgOperand(0);
+ CI->replaceAllUsesWith(Exp);
+ CI->eraseFromParent();
+ Changed = true;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses LowerExpectIntrinsicPass::run(Function &F) {
+ if (lowerExpectIntrinsic(F))
+ return PreservedAnalyses::none();
+
+ return PreservedAnalyses::all();
+}
+
+namespace {
+/// \brief Legacy pass for lowering expect intrinsics out of the IR.
+///
+/// When this pass is run over a function it uses expect intrinsics which feed
+/// branches and switches to provide branch weight metadata for those
+/// terminators. It then removes the expect intrinsics from the IR so the rest
+/// of the optimizer can ignore them.
+class LowerExpectIntrinsic : public FunctionPass {
+public:
+ static char ID;
+ LowerExpectIntrinsic() : FunctionPass(ID) {
+ initializeLowerExpectIntrinsicPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override { return lowerExpectIntrinsic(F); }
+};
+}
+
+char LowerExpectIntrinsic::ID = 0;
+INITIALIZE_PASS(LowerExpectIntrinsic, "lower-expect",
+ "Lower 'expect' Intrinsics", false, false)
+
+FunctionPass *llvm::createLowerExpectIntrinsicPass() {
+ return new LowerExpectIntrinsic();
+}
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index be524be..006b885 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -16,7 +16,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
@@ -28,7 +28,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
#include <list>
using namespace llvm;
@@ -330,11 +330,11 @@ namespace {
// This transformation requires dominator postdominator info
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<AssumptionTracker>();
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<MemoryDependenceAnalysis>();
AU.addRequired<AliasAnalysis>();
- AU.addRequired<TargetLibraryInfo>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addPreserved<AliasAnalysis>();
AU.addPreserved<MemoryDependenceAnalysis>();
}
@@ -363,10 +363,10 @@ FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); }
INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
false, false)
@@ -750,6 +750,16 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
// its dependence information by changing its parameter.
MD->removeInstruction(C);
+ // Update AA metadata
+ // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be
+ // handled here, but combineMetadata doesn't support them yet
+ unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa,
+ LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias,
+ };
+ combineMetadata(C, cpy, KnownIDs);
+
// Remove the memcpy.
MD->removeInstruction(cpy);
++NumMemCpyInstr;
@@ -982,11 +992,13 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {
// If it is greater than the memcpy, then we check to see if we can force the
// source of the memcpy to the alignment we need. If we fail, we bail out.
- AssumptionTracker *AT = &getAnalysis<AssumptionTracker>();
+ AssumptionCache &AC =
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *CS->getParent()->getParent());
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
if (MDep->getAlignment() < ByValAlign &&
- getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign,
- DL, AT, CS.getInstruction(), &DT) < ByValAlign)
+ getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL, &AC,
+ CS.getInstruction(), &DT) < ByValAlign)
return false;
// Verify that the copied-from memory doesn't change in between the memcpy and
@@ -1067,7 +1079,7 @@ bool MemCpyOpt::runOnFunction(Function &F) {
MD = &getAnalysis<MemoryDependenceAnalysis>();
DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
DL = DLP ? &DLP->getDataLayout() : nullptr;
- TLI = &getAnalysis<TargetLibraryInfo>();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
// If we don't have at least memset and memcpy, there is little point of doing
// anything here. These are required by a freestanding implementation, so if
diff --git a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index 8281c59..8fad63f 100644
--- a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -86,7 +86,7 @@
#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <vector>
@@ -115,7 +115,7 @@ public:
private:
// This transformation requires dominator postdominator info
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfo>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<MemoryDependenceAnalysis>();
AU.addRequired<AliasAnalysis>();
AU.addPreserved<AliasAnalysis>();
@@ -143,7 +143,9 @@ private:
// Routines for sinking stores
StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI);
PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1);
- bool isStoreSinkBarrier(Instruction *Inst);
+ bool isStoreSinkBarrierInRange(const Instruction& Start,
+ const Instruction& End,
+ AliasAnalysis::Location Loc);
bool sinkStore(BasicBlock *BB, StoreInst *SinkCand, StoreInst *ElseInst);
bool mergeStores(BasicBlock *BB);
// The mergeLoad/Store algorithms could have Size0 * Size1 complexity,
@@ -166,7 +168,7 @@ FunctionPass *llvm::createMergedLoadStoreMotionPass() {
INITIALIZE_PASS_BEGIN(MergedLoadStoreMotion, "mldst-motion",
"MergedLoadStoreMotion", false, false)
INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_END(MergedLoadStoreMotion, "mldst-motion",
"MergedLoadStoreMotion", false, false)
@@ -239,7 +241,7 @@ bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(const Instruction& Start,
const Instruction& End,
LoadInst* LI) {
AliasAnalysis::Location Loc = AA->getLocation(LI);
- return AA->canInstructionRangeModify(Start, End, Loc);
+ return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::Mod);
}
///
@@ -389,26 +391,19 @@ bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) {
}
///
-/// \brief True when instruction is sink barrier for a store
-///
-bool MergedLoadStoreMotion::isStoreSinkBarrier(Instruction *Inst) {
- // FIXME: Conservatively let a load instruction block the store.
- // Use alias analysis instead.
- if (isa<LoadInst>(Inst))
- return true;
- if (isa<CallInst>(Inst))
- return true;
- if (isa<TerminatorInst>(Inst) && !isa<BranchInst>(Inst))
- return true;
- // Note: mayHaveSideEffects covers all instructions that could
- // trigger a change to state. Eg. in-flight stores have to be executed
- // before ordered loads or fences, calls could invoke functions that store
- // data to memory etc.
- if (!isa<StoreInst>(Inst) && Inst->mayHaveSideEffects()) {
- return true;
- }
- DEBUG(dbgs() << "No Sink Barrier\n");
- return false;
+/// \brief True when instruction is a sink barrier for a store
+/// located in Loc
+///
+/// Whenever an instruction could possibly read or modify the
+/// value being stored or protect against the store from
+/// happening it is considered a sink barrier.
+///
+
+bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction& Start,
+ const Instruction& End,
+ AliasAnalysis::Location
+ Loc) {
+ return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::ModRef);
}
///
@@ -416,27 +411,30 @@ bool MergedLoadStoreMotion::isStoreSinkBarrier(Instruction *Inst) {
///
/// \return The store in \p when it is safe to sink. Otherwise return Null.
///
-StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB,
- StoreInst *SI) {
- StoreInst *I = 0;
- DEBUG(dbgs() << "can Sink? : "; SI->dump(); dbgs() << "\n");
- for (BasicBlock::reverse_iterator RBI = BB->rbegin(), RBE = BB->rend();
+StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1,
+ StoreInst *Store0) {
+ DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n");
+ BasicBlock *BB0 = Store0->getParent();
+ for (BasicBlock::reverse_iterator RBI = BB1->rbegin(), RBE = BB1->rend();
RBI != RBE; ++RBI) {
Instruction *Inst = &*RBI;
- // Only move loads if they are used in the block.
- if (isStoreSinkBarrier(Inst))
- break;
- if (isa<StoreInst>(Inst)) {
- AliasAnalysis::Location LocSI = AA->getLocation(SI);
- AliasAnalysis::Location LocInst = AA->getLocation((StoreInst *)Inst);
- if (AA->isMustAlias(LocSI, LocInst)) {
- I = (StoreInst *)Inst;
- break;
- }
+ if (!isa<StoreInst>(Inst))
+ continue;
+
+ StoreInst *Store1 = cast<StoreInst>(Inst);
+
+ AliasAnalysis::Location Loc0 = AA->getLocation(Store0);
+ AliasAnalysis::Location Loc1 = AA->getLocation(Store1);
+ if (AA->isMustAlias(Loc0, Loc1) && Store0->isSameOperationAs(Store1) &&
+ !isStoreSinkBarrierInRange(*(std::next(BasicBlock::iterator(Store1))),
+ BB1->back(), Loc1) &&
+ !isStoreSinkBarrierInRange(*(std::next(BasicBlock::iterator(Store0))),
+ BB0->back(), Loc0)) {
+ return Store1;
}
}
- return I;
+ return nullptr;
}
///
@@ -548,8 +546,7 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) {
Instruction *I = &*RBI;
++RBI;
- if (isStoreSinkBarrier(I))
- break;
+
// Sink move non-simple (atomic, volatile) stores
if (!isa<StoreInst>(I))
continue;
diff --git a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
index 5c8bed5..31d7df3 100644
--- a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
+++ b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -18,7 +18,7 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -52,16 +52,18 @@ INITIALIZE_PASS(PartiallyInlineLibCalls, "partially-inline-libcalls",
"Partially inline calls to library functions", false, false)
void PartiallyInlineLibCalls::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<TargetLibraryInfo>();
- AU.addRequired<TargetTransformInfo>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
bool PartiallyInlineLibCalls::runOnFunction(Function &F) {
bool Changed = false;
Function::iterator CurrBB;
- TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
- const TargetTransformInfo *TTI = &getAnalysis<TargetTransformInfo>();
+ TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ const TargetTransformInfo *TTI =
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) {
CurrBB = BB++;
@@ -126,7 +128,7 @@ bool PartiallyInlineLibCalls::optimizeSQRT(CallInst *Call,
// Move all instructions following Call to newly created block JoinBB.
// Create phi and replace all uses.
- BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode(), this);
+ BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode());
IRBuilder<> Builder(JoinBB, JoinBB->begin());
PHINode *Phi = Builder.CreatePHI(Call->getType(), 2);
Call->replaceAllUsesWith(Phi);
diff --git a/lib/Transforms/Scalar/PlaceSafepoints.cpp b/lib/Transforms/Scalar/PlaceSafepoints.cpp
new file mode 100644
index 0000000..944725a
--- /dev/null
+++ b/lib/Transforms/Scalar/PlaceSafepoints.cpp
@@ -0,0 +1,989 @@
+//===- PlaceSafepoints.cpp - Place GC Safepoints --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Place garbage collection safepoints at appropriate locations in the IR. This
+// does not make relocation semantics or variable liveness explicit. That's
+// done by RewriteStatepointsForGC.
+//
+// Terminology:
+// - A call is said to be "parseable" if there is a stack map generated for the
+// return PC of the call. A runtime can determine where values listed in the
+// deopt arguments and (after RewriteStatepointsForGC) gc arguments are located
+// on the stack when the code is suspended inside such a call. Every parse
+// point is represented by a call wrapped in an gc.statepoint intrinsic.
+// - A "poll" is an explicit check in the generated code to determine if the
+// runtime needs the generated code to cooperate by calling a helper routine
+// and thus suspending its execution at a known state. The call to the helper
+// routine will be parseable. The (gc & runtime specific) logic of a poll is
+// assumed to be provided in a function of the name "gc.safepoint_poll".
+//
+// We aim to insert polls such that running code can quickly be brought to a
+// well defined state for inspection by the collector. In the current
+// implementation, this is done via the insertion of poll sites at method entry
+// and the backedge of most loops. We try to avoid inserting more polls than
+// are neccessary to ensure a finite period between poll sites. This is not
+// because the poll itself is expensive in the generated code; it's not. Polls
+// do tend to impact the optimizer itself in negative ways; we'd like to avoid
+// perturbing the optimization of the method as much as we can.
+//
+// We also need to make most call sites parseable. The callee might execute a
+// poll (or otherwise be inspected by the GC). If so, the entire stack
+// (including the suspended frame of the current method) must be parseable.
+//
+// This pass will insert:
+// - Call parse points ("call safepoints") for any call which may need to
+// reach a safepoint during the execution of the callee function.
+// - Backedge safepoint polls and entry safepoint polls to ensure that
+// executing code reaches a safepoint poll in a finite amount of time.
+//
+// We do not currently support return statepoints, but adding them would not
+// be hard. They are not required for correctness - entry safepoints are an
+// alternative - but some GCs may prefer them. Patches welcome.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Pass.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#define DEBUG_TYPE "safepoint-placement"
+STATISTIC(NumEntrySafepoints, "Number of entry safepoints inserted");
+STATISTIC(NumCallSafepoints, "Number of call safepoints inserted");
+STATISTIC(NumBackedgeSafepoints, "Number of backedge safepoints inserted");
+
+STATISTIC(CallInLoop, "Number of loops w/o safepoints due to calls in loop");
+STATISTIC(FiniteExecution, "Number of loops w/o safepoints finite execution");
+
+using namespace llvm;
+
+// Ignore oppurtunities to avoid placing safepoints on backedges, useful for
+// validation
+static cl::opt<bool> AllBackedges("spp-all-backedges", cl::Hidden,
+ cl::init(false));
+
+/// If true, do not place backedge safepoints in counted loops.
+static cl::opt<bool> SkipCounted("spp-counted", cl::Hidden, cl::init(true));
+
+// If true, split the backedge of a loop when placing the safepoint, otherwise
+// split the latch block itself. Both are useful to support for
+// experimentation, but in practice, it looks like splitting the backedge
+// optimizes better.
+static cl::opt<bool> SplitBackedge("spp-split-backedge", cl::Hidden,
+ cl::init(false));
+
+// Print tracing output
+static cl::opt<bool> TraceLSP("spp-trace", cl::Hidden, cl::init(false));
+
+namespace {
+
+/** An analysis pass whose purpose is to identify each of the backedges in
+ the function which require a safepoint poll to be inserted. */
+struct PlaceBackedgeSafepointsImpl : public LoopPass {
+ static char ID;
+
+ /// The output of the pass - gives a list of each backedge (described by
+ /// pointing at the branch) which need a poll inserted.
+ std::vector<TerminatorInst *> PollLocations;
+
+ /// True unless we're running spp-no-calls in which case we need to disable
+ /// the call dependend placement opts.
+ bool CallSafepointsEnabled;
+ PlaceBackedgeSafepointsImpl(bool CallSafepoints = false)
+ : LoopPass(ID), CallSafepointsEnabled(CallSafepoints) {
+ initializePlaceBackedgeSafepointsImplPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *, LPPassManager &LPM) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // needed for determining if the loop is finite
+ AU.addRequired<ScalarEvolution>();
+ // to ensure each edge has a single backedge
+ // TODO: is this still required?
+ AU.addRequiredID(LoopSimplifyID);
+
+ // We no longer modify the IR at all in this pass. Thus all
+ // analysis are preserved.
+ AU.setPreservesAll();
+ }
+};
+}
+
+static cl::opt<bool> NoEntry("spp-no-entry", cl::Hidden, cl::init(false));
+static cl::opt<bool> NoCall("spp-no-call", cl::Hidden, cl::init(false));
+static cl::opt<bool> NoBackedge("spp-no-backedge", cl::Hidden, cl::init(false));
+
+namespace {
+struct PlaceSafepoints : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+
+ PlaceSafepoints() : ModulePass(ID) {
+ initializePlaceSafepointsPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnModule(Module &M) override {
+ bool modified = false;
+ for (Function &F : M) {
+ modified |= runOnFunction(F);
+ }
+ return modified;
+ }
+ bool runOnFunction(Function &F);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // We modify the graph wholesale (inlining, block insertion, etc). We
+ // preserve nothing at the moment. We could potentially preserve dom tree
+ // if that was worth doing
+ }
+};
+}
+
+// Insert a safepoint poll immediately before the given instruction. Does
+// not handle the parsability of state at the runtime call, that's the
+// callers job.
+static void
+InsertSafepointPoll(DominatorTree &DT, Instruction *after,
+ std::vector<CallSite> &ParsePointsNeeded /*rval*/);
+
+static bool isGCLeafFunction(const CallSite &CS);
+
+static bool needsStatepoint(const CallSite &CS) {
+ if (isGCLeafFunction(CS))
+ return false;
+ if (CS.isCall()) {
+ CallInst *call = cast<CallInst>(CS.getInstruction());
+ if (call->isInlineAsm())
+ return false;
+ }
+ if (isStatepoint(CS) || isGCRelocate(CS) || isGCResult(CS)) {
+ return false;
+ }
+ return true;
+}
+
+static Value *ReplaceWithStatepoint(const CallSite &CS, Pass *P);
+
+/// Returns true if this loop is known to contain a call safepoint which
+/// must unconditionally execute on any iteration of the loop which returns
+/// to the loop header via an edge from Pred. Returns a conservative correct
+/// answer; i.e. false is always valid.
+static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header,
+ BasicBlock *Pred,
+ DominatorTree &DT) {
+ // In general, we're looking for any cut of the graph which ensures
+ // there's a call safepoint along every edge between Header and Pred.
+ // For the moment, we look only for the 'cuts' that consist of a single call
+ // instruction in a block which is dominated by the Header and dominates the
+ // loop latch (Pred) block. Somewhat surprisingly, walking the entire chain
+ // of such dominating blocks gets substaintially more occurences than just
+ // checking the Pred and Header blocks themselves. This may be due to the
+ // density of loop exit conditions caused by range and null checks.
+ // TODO: structure this as an analysis pass, cache the result for subloops,
+ // avoid dom tree recalculations
+ assert(DT.dominates(Header, Pred) && "loop latch not dominated by header?");
+
+ BasicBlock *Current = Pred;
+ while (true) {
+ for (Instruction &I : *Current) {
+ if (CallSite CS = &I)
+ // Note: Technically, needing a safepoint isn't quite the right
+ // condition here. We should instead be checking if the target method
+ // has an
+ // unconditional poll. In practice, this is only a theoretical concern
+ // since we don't have any methods with conditional-only safepoint
+ // polls.
+ if (needsStatepoint(CS))
+ return true;
+ }
+
+ if (Current == Header)
+ break;
+ Current = DT.getNode(Current)->getIDom()->getBlock();
+ }
+
+ return false;
+}
+
+/// Returns true if this loop is known to terminate in a finite number of
+/// iterations. Note that this function may return false for a loop which
+/// does actual terminate in a finite constant number of iterations due to
+/// conservatism in the analysis.
+static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE,
+ BasicBlock *Pred) {
+ // Only used when SkipCounted is off
+ const unsigned upperTripBound = 8192;
+
+ // A conservative bound on the loop as a whole.
+ const SCEV *MaxTrips = SE->getMaxBackedgeTakenCount(L);
+ if (MaxTrips != SE->getCouldNotCompute()) {
+ if (SE->getUnsignedRange(MaxTrips).getUnsignedMax().ult(upperTripBound))
+ return true;
+ if (SkipCounted &&
+ SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN(32))
+ return true;
+ }
+
+ // If this is a conditional branch to the header with the alternate path
+ // being outside the loop, we can ask questions about the execution frequency
+ // of the exit block.
+ if (L->isLoopExiting(Pred)) {
+ // This returns an exact expression only. TODO: We really only need an
+ // upper bound here, but SE doesn't expose that.
+ const SCEV *MaxExec = SE->getExitCount(L, Pred);
+ if (MaxExec != SE->getCouldNotCompute()) {
+ if (SE->getUnsignedRange(MaxExec).getUnsignedMax().ult(upperTripBound))
+ return true;
+ if (SkipCounted &&
+ SE->getUnsignedRange(MaxExec).getUnsignedMax().isIntN(32))
+ return true;
+ }
+ }
+
+ return /* not finite */ false;
+}
+
+static void scanOneBB(Instruction *start, Instruction *end,
+ std::vector<CallInst *> &calls,
+ std::set<BasicBlock *> &seen,
+ std::vector<BasicBlock *> &worklist) {
+ for (BasicBlock::iterator itr(start);
+ itr != start->getParent()->end() && itr != BasicBlock::iterator(end);
+ itr++) {
+ if (CallInst *CI = dyn_cast<CallInst>(&*itr)) {
+ calls.push_back(CI);
+ }
+ // FIXME: This code does not handle invokes
+ assert(!dyn_cast<InvokeInst>(&*itr) &&
+ "support for invokes in poll code needed");
+ // Only add the successor blocks if we reach the terminator instruction
+ // without encountering end first
+ if (itr->isTerminator()) {
+ BasicBlock *BB = itr->getParent();
+ for (BasicBlock *Succ : successors(BB)) {
+ if (seen.count(Succ) == 0) {
+ worklist.push_back(Succ);
+ seen.insert(Succ);
+ }
+ }
+ }
+ }
+}
+static void scanInlinedCode(Instruction *start, Instruction *end,
+ std::vector<CallInst *> &calls,
+ std::set<BasicBlock *> &seen) {
+ calls.clear();
+ std::vector<BasicBlock *> worklist;
+ seen.insert(start->getParent());
+ scanOneBB(start, end, calls, seen, worklist);
+ while (!worklist.empty()) {
+ BasicBlock *BB = worklist.back();
+ worklist.pop_back();
+ scanOneBB(&*BB->begin(), end, calls, seen, worklist);
+ }
+}
+
+bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L, LPPassManager &LPM) {
+ ScalarEvolution *SE = &getAnalysis<ScalarEvolution>();
+
+ // Loop through all predecessors of the loop header and identify all
+ // backedges. We need to place a safepoint on every backedge (potentially).
+ // Note: Due to LoopSimplify there should only be one. Assert? Or can we
+ // relax this?
+ BasicBlock *header = L->getHeader();
+
+ // TODO: Use the analysis pass infrastructure for this. There is no reason
+ // to recalculate this here.
+ DominatorTree DT;
+ DT.recalculate(*header->getParent());
+
+ bool modified = false;
+ for (BasicBlock *pred : predecessors(header)) {
+ if (!L->contains(pred)) {
+ // This is not a backedge, it's coming from outside the loop
+ continue;
+ }
+
+ // Make a policy decision about whether this loop needs a safepoint or
+ // not. Note that this is about unburdening the optimizer in loops, not
+ // avoiding the runtime cost of the actual safepoint.
+ if (!AllBackedges) {
+ if (mustBeFiniteCountedLoop(L, SE, pred)) {
+ if (TraceLSP)
+ errs() << "skipping safepoint placement in finite loop\n";
+ FiniteExecution++;
+ continue;
+ }
+ if (CallSafepointsEnabled &&
+ containsUnconditionalCallSafepoint(L, header, pred, DT)) {
+ // Note: This is only semantically legal since we won't do any further
+ // IPO or inlining before the actual call insertion.. If we hadn't, we
+ // might latter loose this call safepoint.
+ if (TraceLSP)
+ errs() << "skipping safepoint placement due to unconditional call\n";
+ CallInLoop++;
+ continue;
+ }
+ }
+
+ // TODO: We can create an inner loop which runs a finite number of
+ // iterations with an outer loop which contains a safepoint. This would
+ // not help runtime performance that much, but it might help our ability to
+ // optimize the inner loop.
+
+ // We're unconditionally going to modify this loop.
+ modified = true;
+
+ // Safepoint insertion would involve creating a new basic block (as the
+ // target of the current backedge) which does the safepoint (of all live
+ // variables) and branches to the true header
+ TerminatorInst *term = pred->getTerminator();
+
+ if (TraceLSP) {
+ errs() << "[LSP] terminator instruction: ";
+ term->dump();
+ }
+
+ PollLocations.push_back(term);
+ }
+
+ return modified;
+}
+
+static Instruction *findLocationForEntrySafepoint(Function &F,
+ DominatorTree &DT) {
+
+ // Conceptually, this poll needs to be on method entry, but in
+ // practice, we place it as late in the entry block as possible. We
+ // can place it as late as we want as long as it dominates all calls
+ // that can grow the stack. This, combined with backedge polls,
+ // give us all the progress guarantees we need.
+
+ // Due to the way the frontend generates IR, we may have a couple of initial
+ // basic blocks before the first bytecode. These will be single-entry
+ // single-exit blocks which conceptually are just part of the first 'real
+ // basic block'. Since we don't have deopt state until the first bytecode,
+ // walk forward until we've found the first unconditional branch or merge.
+
+ // hasNextInstruction and nextInstruction are used to iterate
+ // through a "straight line" execution sequence.
+
+ auto hasNextInstruction = [](Instruction *I) {
+ if (!I->isTerminator()) {
+ return true;
+ }
+ BasicBlock *nextBB = I->getParent()->getUniqueSuccessor();
+ return nextBB && (nextBB->getUniquePredecessor() != nullptr);
+ };
+
+ auto nextInstruction = [&hasNextInstruction](Instruction *I) {
+ assert(hasNextInstruction(I) &&
+ "first check if there is a next instruction!");
+ if (I->isTerminator()) {
+ return I->getParent()->getUniqueSuccessor()->begin();
+ } else {
+ return std::next(BasicBlock::iterator(I));
+ }
+ };
+
+ Instruction *cursor = nullptr;
+ for (cursor = F.getEntryBlock().begin(); hasNextInstruction(cursor);
+ cursor = nextInstruction(cursor)) {
+
+ // We need to stop going forward as soon as we see a call that can
+ // grow the stack (i.e. the call target has a non-zero frame
+ // size).
+ if (CallSite CS = cursor) {
+ (void)CS; // Silence an unused variable warning by gcc 4.8.2
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(cursor)) {
+ // llvm.assume(...) are not really calls.
+ if (II->getIntrinsicID() == Intrinsic::assume) {
+ continue;
+ }
+ }
+ break;
+ }
+ }
+
+ assert((hasNextInstruction(cursor) || cursor->isTerminator()) &&
+ "either we stopped because of a call, or because of terminator");
+
+ if (cursor->isTerminator()) {
+ return cursor;
+ }
+
+ BasicBlock *BB = cursor->getParent();
+ SplitBlock(BB, cursor, nullptr);
+
+ // Note: SplitBlock modifies the DT. Simply passing a Pass (which is a
+ // module pass) is not enough.
+ DT.recalculate(F);
+#ifndef NDEBUG
+ // SplitBlock updates the DT
+ DT.verifyDomTree();
+#endif
+
+ return BB->getTerminator();
+}
+
+/// Identify the list of call sites which need to be have parseable state
+static void findCallSafepoints(Function &F,
+ std::vector<CallSite> &Found /*rval*/) {
+ assert(Found.empty() && "must be empty!");
+ for (Instruction &I : inst_range(F)) {
+ Instruction *inst = &I;
+ if (isa<CallInst>(inst) || isa<InvokeInst>(inst)) {
+ CallSite CS(inst);
+
+ // No safepoint needed or wanted
+ if (!needsStatepoint(CS)) {
+ continue;
+ }
+
+ Found.push_back(CS);
+ }
+ }
+}
+
+/// Implement a unique function which doesn't require we sort the input
+/// vector. Doing so has the effect of changing the output of a couple of
+/// tests in ways which make them less useful in testing fused safepoints.
+template <typename T> static void unique_unsorted(std::vector<T> &vec) {
+ std::set<T> seen;
+ std::vector<T> tmp;
+ vec.reserve(vec.size());
+ std::swap(tmp, vec);
+ for (auto V : tmp) {
+ if (seen.insert(V).second) {
+ vec.push_back(V);
+ }
+ }
+}
+
+static std::string GCSafepointPollName("gc.safepoint_poll");
+
+static bool isGCSafepointPoll(Function &F) {
+ return F.getName().equals(GCSafepointPollName);
+}
+
+/// Returns true if this function should be rewritten to include safepoint
+/// polls and parseable call sites. The main point of this function is to be
+/// an extension point for custom logic.
+static bool shouldRewriteFunction(Function &F) {
+ // TODO: This should check the GCStrategy
+ if (F.hasGC()) {
+ const std::string StatepointExampleName("statepoint-example");
+ return StatepointExampleName == F.getGC();
+ } else
+ return false;
+}
+
+// TODO: These should become properties of the GCStrategy, possibly with
+// command line overrides.
+static bool enableEntrySafepoints(Function &F) { return !NoEntry; }
+static bool enableBackedgeSafepoints(Function &F) { return !NoBackedge; }
+static bool enableCallSafepoints(Function &F) { return !NoCall; }
+
+
+bool PlaceSafepoints::runOnFunction(Function &F) {
+ if (F.isDeclaration() || F.empty()) {
+ // This is a declaration, nothing to do. Must exit early to avoid crash in
+ // dom tree calculation
+ return false;
+ }
+
+ if (isGCSafepointPoll(F)) {
+ // Given we're inlining this inside of safepoint poll insertion, this
+ // doesn't make any sense. Note that we do make any contained calls
+ // parseable after we inline a poll.
+ return false;
+ }
+
+ if (!shouldRewriteFunction(F))
+ return false;
+
+ bool modified = false;
+
+ // In various bits below, we rely on the fact that uses are reachable from
+ // defs. When there are basic blocks unreachable from the entry, dominance
+ // and reachablity queries return non-sensical results. Thus, we preprocess
+ // the function to ensure these properties hold.
+ modified |= removeUnreachableBlocks(F);
+
+ // STEP 1 - Insert the safepoint polling locations. We do not need to
+ // actually insert parse points yet. That will be done for all polls and
+ // calls in a single pass.
+
+ // Note: With the migration, we need to recompute this for each 'pass'. Once
+ // we merge these, we'll do it once before the analysis
+ DominatorTree DT;
+
+ std::vector<CallSite> ParsePointNeeded;
+
+ if (enableBackedgeSafepoints(F)) {
+ // Construct a pass manager to run the LoopPass backedge logic. We
+ // need the pass manager to handle scheduling all the loop passes
+ // appropriately. Doing this by hand is painful and just not worth messing
+ // with for the moment.
+ legacy::FunctionPassManager FPM(F.getParent());
+ bool CanAssumeCallSafepoints = enableCallSafepoints(F);
+ PlaceBackedgeSafepointsImpl *PBS =
+ new PlaceBackedgeSafepointsImpl(CanAssumeCallSafepoints);
+ FPM.add(PBS);
+ // Note: While the analysis pass itself won't modify the IR, LoopSimplify
+ // (which it depends on) may. i.e. analysis must be recalculated after run
+ FPM.run(F);
+
+ // We preserve dominance information when inserting the poll, otherwise
+ // we'd have to recalculate this on every insert
+ DT.recalculate(F);
+
+ // Insert a poll at each point the analysis pass identified
+ for (size_t i = 0; i < PBS->PollLocations.size(); i++) {
+ // We are inserting a poll, the function is modified
+ modified = true;
+
+ // The poll location must be the terminator of a loop latch block.
+ TerminatorInst *Term = PBS->PollLocations[i];
+
+ std::vector<CallSite> ParsePoints;
+ if (SplitBackedge) {
+ // Split the backedge of the loop and insert the poll within that new
+ // basic block. This creates a loop with two latches per original
+ // latch (which is non-ideal), but this appears to be easier to
+ // optimize in practice than inserting the poll immediately before the
+ // latch test.
+
+ // Since this is a latch, at least one of the successors must dominate
+ // it. Its possible that we have a) duplicate edges to the same header
+ // and b) edges to distinct loop headers. We need to insert pools on
+ // each. (Note: This still relies on LoopSimplify.)
+ DenseSet<BasicBlock *> Headers;
+ for (unsigned i = 0; i < Term->getNumSuccessors(); i++) {
+ BasicBlock *Succ = Term->getSuccessor(i);
+ if (DT.dominates(Succ, Term->getParent())) {
+ Headers.insert(Succ);
+ }
+ }
+ assert(!Headers.empty() && "poll location is not a loop latch?");
+
+ // The split loop structure here is so that we only need to recalculate
+ // the dominator tree once. Alternatively, we could just keep it up to
+ // date and use a more natural merged loop.
+ DenseSet<BasicBlock *> SplitBackedges;
+ for (BasicBlock *Header : Headers) {
+ BasicBlock *NewBB = SplitEdge(Term->getParent(), Header, nullptr);
+ SplitBackedges.insert(NewBB);
+ }
+ DT.recalculate(F);
+ for (BasicBlock *NewBB : SplitBackedges) {
+ InsertSafepointPoll(DT, NewBB->getTerminator(), ParsePoints);
+ NumBackedgeSafepoints++;
+ }
+
+ } else {
+ // Split the latch block itself, right before the terminator.
+ InsertSafepointPoll(DT, Term, ParsePoints);
+ NumBackedgeSafepoints++;
+ }
+
+ // Record the parse points for later use
+ ParsePointNeeded.insert(ParsePointNeeded.end(), ParsePoints.begin(),
+ ParsePoints.end());
+ }
+ }
+
+ if (enableEntrySafepoints(F)) {
+ DT.recalculate(F);
+ Instruction *term = findLocationForEntrySafepoint(F, DT);
+ if (!term) {
+ // policy choice not to insert?
+ } else {
+ std::vector<CallSite> RuntimeCalls;
+ InsertSafepointPoll(DT, term, RuntimeCalls);
+ modified = true;
+ NumEntrySafepoints++;
+ ParsePointNeeded.insert(ParsePointNeeded.end(), RuntimeCalls.begin(),
+ RuntimeCalls.end());
+ }
+ }
+
+ if (enableCallSafepoints(F)) {
+ DT.recalculate(F);
+ std::vector<CallSite> Calls;
+ findCallSafepoints(F, Calls);
+ NumCallSafepoints += Calls.size();
+ ParsePointNeeded.insert(ParsePointNeeded.end(), Calls.begin(), Calls.end());
+ }
+
+ // Unique the vectors since we can end up with duplicates if we scan the call
+ // site for call safepoints after we add it for entry or backedge. The
+ // only reason we need tracking at all is that some functions might have
+ // polls but not call safepoints and thus we might miss marking the runtime
+ // calls for the polls. (This is useful in test cases!)
+ unique_unsorted(ParsePointNeeded);
+
+ // Any parse point (no matter what source) will be handled here
+ DT.recalculate(F); // Needed?
+
+ // We're about to start modifying the function
+ if (!ParsePointNeeded.empty())
+ modified = true;
+
+ // Now run through and insert the safepoints, but do _NOT_ update or remove
+ // any existing uses. We have references to live variables that need to
+ // survive to the last iteration of this loop.
+ std::vector<Value *> Results;
+ Results.reserve(ParsePointNeeded.size());
+ for (size_t i = 0; i < ParsePointNeeded.size(); i++) {
+ CallSite &CS = ParsePointNeeded[i];
+ Value *GCResult = ReplaceWithStatepoint(CS, nullptr);
+ Results.push_back(GCResult);
+ }
+ assert(Results.size() == ParsePointNeeded.size());
+
+ // Adjust all users of the old call sites to use the new ones instead
+ for (size_t i = 0; i < ParsePointNeeded.size(); i++) {
+ CallSite &CS = ParsePointNeeded[i];
+ Value *GCResult = Results[i];
+ if (GCResult) {
+ // In case if we inserted result in a different basic block than the
+ // original safepoint (this can happen for invokes). We need to be sure
+ // that
+ // original result value was not used in any of the phi nodes at the
+ // beginning of basic block with gc result. Because we know that all such
+ // blocks will have single predecessor we can safely assume that all phi
+ // nodes have single entry (because of normalizeBBForInvokeSafepoint).
+ // Just remove them all here.
+ if (CS.isInvoke()) {
+ FoldSingleEntryPHINodes(cast<Instruction>(GCResult)->getParent(),
+ nullptr);
+ assert(
+ !isa<PHINode>(cast<Instruction>(GCResult)->getParent()->begin()));
+ }
+
+ // Replace all uses with the new call
+ CS.getInstruction()->replaceAllUsesWith(GCResult);
+ }
+
+ // Now that we've handled all uses, remove the original call itself
+ // Note: The insert point can't be the deleted instruction!
+ CS.getInstruction()->eraseFromParent();
+ }
+ return modified;
+}
+
+char PlaceBackedgeSafepointsImpl::ID = 0;
+char PlaceSafepoints::ID = 0;
+
+ModulePass *llvm::createPlaceSafepointsPass() { return new PlaceSafepoints(); }
+
+INITIALIZE_PASS_BEGIN(PlaceBackedgeSafepointsImpl,
+ "place-backedge-safepoints-impl",
+ "Place Backedge Safepoints", false, false)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_END(PlaceBackedgeSafepointsImpl,
+ "place-backedge-safepoints-impl",
+ "Place Backedge Safepoints", false, false)
+
+INITIALIZE_PASS_BEGIN(PlaceSafepoints, "place-safepoints", "Place Safepoints",
+ false, false)
+INITIALIZE_PASS_END(PlaceSafepoints, "place-safepoints", "Place Safepoints",
+ false, false)
+
+static bool isGCLeafFunction(const CallSite &CS) {
+ Instruction *inst = CS.getInstruction();
+ if (isa<IntrinsicInst>(inst)) {
+ // Most LLVM intrinsics are things which can never take a safepoint.
+ // As a result, we don't need to have the stack parsable at the
+ // callsite. This is a highly useful optimization since intrinsic
+ // calls are fairly prevelent, particularly in debug builds.
+ return true;
+ }
+
+ // If this function is marked explicitly as a leaf call, we don't need to
+ // place a safepoint of it. In fact, for correctness we *can't* in many
+ // cases. Note: Indirect calls return Null for the called function,
+ // these obviously aren't runtime functions with attributes
+ // TODO: Support attributes on the call site as well.
+ const Function *F = CS.getCalledFunction();
+ bool isLeaf =
+ F &&
+ F->getFnAttribute("gc-leaf-function").getValueAsString().equals("true");
+ if (isLeaf) {
+ return true;
+ }
+ return false;
+}
+
+static void
+InsertSafepointPoll(DominatorTree &DT, Instruction *term,
+ std::vector<CallSite> &ParsePointsNeeded /*rval*/) {
+ Module *M = term->getParent()->getParent()->getParent();
+ assert(M);
+
+ // Inline the safepoint poll implementation - this will get all the branch,
+ // control flow, etc.. Most importantly, it will introduce the actual slow
+ // path call - where we need to insert a safepoint (parsepoint).
+ FunctionType *ftype =
+ FunctionType::get(Type::getVoidTy(M->getContext()), false);
+ assert(ftype && "null?");
+ // Note: This cast can fail if there's a function of the same name with a
+ // different type inserted previously
+ Function *F =
+ dyn_cast<Function>(M->getOrInsertFunction("gc.safepoint_poll", ftype));
+ assert(F && "void @gc.safepoint_poll() must be defined");
+ assert(!F->empty() && "gc.safepoint_poll must be a non-empty function");
+ CallInst *poll = CallInst::Create(F, "", term);
+
+ // Record some information about the call site we're replacing
+ BasicBlock *OrigBB = term->getParent();
+ BasicBlock::iterator before(poll), after(poll);
+ bool isBegin(false);
+ if (before == term->getParent()->begin()) {
+ isBegin = true;
+ } else {
+ before--;
+ }
+ after++;
+ assert(after != poll->getParent()->end() && "must have successor");
+ assert(DT.dominates(before, after) && "trivially true");
+
+ // do the actual inlining
+ InlineFunctionInfo IFI;
+ bool inlineStatus = InlineFunction(poll, IFI);
+ assert(inlineStatus && "inline must succeed");
+ (void)inlineStatus; // suppress warning in release-asserts
+
+ // Check post conditions
+ assert(IFI.StaticAllocas.empty() && "can't have allocs");
+
+ std::vector<CallInst *> calls; // new calls
+ std::set<BasicBlock *> BBs; // new BBs + insertee
+ // Include only the newly inserted instructions, Note: begin may not be valid
+ // if we inserted to the beginning of the basic block
+ BasicBlock::iterator start;
+ if (isBegin) {
+ start = OrigBB->begin();
+ } else {
+ start = before;
+ start++;
+ }
+
+ // If your poll function includes an unreachable at the end, that's not
+ // valid. Bugpoint likes to create this, so check for it.
+ assert(isPotentiallyReachable(&*start, &*after, nullptr, nullptr) &&
+ "malformed poll function");
+
+ scanInlinedCode(&*(start), &*(after), calls, BBs);
+
+ // Recompute since we've invalidated cached data. Conceptually we
+ // shouldn't need to do this, but implementation wise we appear to. Needed
+ // so we can insert safepoints correctly.
+ // TODO: update more cheaply
+ DT.recalculate(*after->getParent()->getParent());
+
+ assert(!calls.empty() && "slow path not found for safepoint poll");
+
+ // Record the fact we need a parsable state at the runtime call contained in
+ // the poll function. This is required so that the runtime knows how to
+ // parse the last frame when we actually take the safepoint (i.e. execute
+ // the slow path)
+ assert(ParsePointsNeeded.empty());
+ for (size_t i = 0; i < calls.size(); i++) {
+
+ // No safepoint needed or wanted
+ if (!needsStatepoint(calls[i])) {
+ continue;
+ }
+
+ // These are likely runtime calls. Should we assert that via calling
+ // convention or something?
+ ParsePointsNeeded.push_back(CallSite(calls[i]));
+ }
+ assert(ParsePointsNeeded.size() <= calls.size());
+}
+
+// Normalize basic block to make it ready to be target of invoke statepoint.
+// It means spliting it to have single predecessor. Return newly created BB
+// ready to be successor of invoke statepoint.
+static BasicBlock *normalizeBBForInvokeSafepoint(BasicBlock *BB,
+ BasicBlock *InvokeParent) {
+ BasicBlock *ret = BB;
+
+ if (!BB->getUniquePredecessor()) {
+ ret = SplitBlockPredecessors(BB, InvokeParent, "");
+ }
+
+ // Another requirement for such basic blocks is to not have any phi nodes.
+ // Since we just ensured that new BB will have single predecessor,
+ // all phi nodes in it will have one value. Here it would be naturall place
+ // to
+ // remove them all. But we can not do this because we are risking to remove
+ // one of the values stored in liveset of another statepoint. We will do it
+ // later after placing all safepoints.
+
+ return ret;
+}
+
+/// Replaces the given call site (Call or Invoke) with a gc.statepoint
+/// intrinsic with an empty deoptimization arguments list. This does
+/// NOT do explicit relocation for GC support.
+static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */
+ Pass *P) {
+ BasicBlock *BB = CS.getInstruction()->getParent();
+ Function *F = BB->getParent();
+ Module *M = F->getParent();
+ assert(M && "must be set");
+
+ // TODO: technically, a pass is not allowed to get functions from within a
+ // function pass since it might trigger a new function addition. Refactor
+ // this logic out to the initialization of the pass. Doesn't appear to
+ // matter in practice.
+
+ // Then go ahead and use the builder do actually do the inserts. We insert
+ // immediately before the previous instruction under the assumption that all
+ // arguments will be available here. We can't insert afterwards since we may
+ // be replacing a terminator.
+ Instruction *insertBefore = CS.getInstruction();
+ IRBuilder<> Builder(insertBefore);
+
+ // Note: The gc args are not filled in at this time, that's handled by
+ // RewriteStatepointsForGC (which is currently under review).
+
+ // Create the statepoint given all the arguments
+ Instruction *token = nullptr;
+ AttributeSet return_attributes;
+ if (CS.isCall()) {
+ CallInst *toReplace = cast<CallInst>(CS.getInstruction());
+ CallInst *Call = Builder.CreateGCStatepoint(
+ CS.getCalledValue(), makeArrayRef(CS.arg_begin(), CS.arg_end()), None,
+ None, "safepoint_token");
+ Call->setTailCall(toReplace->isTailCall());
+ Call->setCallingConv(toReplace->getCallingConv());
+
+ // Before we have to worry about GC semantics, all attributes are legal
+ AttributeSet new_attrs = toReplace->getAttributes();
+ // In case if we can handle this set of sttributes - set up function attrs
+ // directly on statepoint and return attrs later for gc_result intrinsic.
+ Call->setAttributes(new_attrs.getFnAttributes());
+ return_attributes = new_attrs.getRetAttributes();
+ // TODO: handle param attributes
+
+ token = Call;
+
+ // Put the following gc_result and gc_relocate calls immediately after the
+ // the old call (which we're about to delete)
+ BasicBlock::iterator next(toReplace);
+ assert(BB->end() != next && "not a terminator, must have next");
+ next++;
+ Instruction *IP = &*(next);
+ Builder.SetInsertPoint(IP);
+ Builder.SetCurrentDebugLocation(IP->getDebugLoc());
+
+ } else if (CS.isInvoke()) {
+ // TODO: make CreateGCStatepoint return an Instruction that we can cast to a
+ // Call or Invoke, instead of doing this junk here.
+
+ // Fill in the one generic type'd argument (the function is also
+ // vararg)
+ std::vector<Type *> argTypes;
+ argTypes.push_back(CS.getCalledValue()->getType());
+
+ Function *gc_statepoint_decl = Intrinsic::getDeclaration(
+ M, Intrinsic::experimental_gc_statepoint, argTypes);
+
+ // First, create the statepoint (with all live ptrs as arguments).
+ std::vector<llvm::Value *> args;
+ // target, #call args, unused, ... call parameters, #deopt args, ... deopt
+ // parameters, ... gc parameters
+ Value *Target = CS.getCalledValue();
+ args.push_back(Target);
+ int callArgSize = CS.arg_size();
+ // #call args
+ args.push_back(Builder.getInt32(callArgSize));
+ // unused
+ args.push_back(Builder.getInt32(0));
+ // call parameters
+ args.insert(args.end(), CS.arg_begin(), CS.arg_end());
+ // #deopt args: 0
+ args.push_back(Builder.getInt32(0));
+
+ InvokeInst *toReplace = cast<InvokeInst>(CS.getInstruction());
+
+ // Insert the new invoke into the old block. We'll remove the old one in a
+ // moment at which point this will become the new terminator for the
+ // original block.
+ InvokeInst *invoke = InvokeInst::Create(
+ gc_statepoint_decl, toReplace->getNormalDest(),
+ toReplace->getUnwindDest(), args, "", toReplace->getParent());
+ invoke->setCallingConv(toReplace->getCallingConv());
+
+ // Currently we will fail on parameter attributes and on certain
+ // function attributes.
+ AttributeSet new_attrs = toReplace->getAttributes();
+ // In case if we can handle this set of sttributes - set up function attrs
+ // directly on statepoint and return attrs later for gc_result intrinsic.
+ invoke->setAttributes(new_attrs.getFnAttributes());
+ return_attributes = new_attrs.getRetAttributes();
+
+ token = invoke;
+
+ // We'll insert the gc.result into the normal block
+ BasicBlock *normalDest = normalizeBBForInvokeSafepoint(
+ toReplace->getNormalDest(), invoke->getParent());
+ Instruction *IP = &*(normalDest->getFirstInsertionPt());
+ Builder.SetInsertPoint(IP);
+ } else {
+ llvm_unreachable("unexpect type of CallSite");
+ }
+ assert(token);
+
+ // Handle the return value of the original call - update all uses to use a
+ // gc_result hanging off the statepoint node we just inserted
+
+ // Only add the gc_result iff there is actually a used result
+ if (!CS.getType()->isVoidTy() && !CS.getInstruction()->use_empty()) {
+ std::string takenName =
+ CS.getInstruction()->hasName() ? CS.getInstruction()->getName() : "";
+ CallInst *gc_result =
+ Builder.CreateGCResult(token, CS.getType(), takenName);
+ gc_result->setAttributes(return_attributes);
+ return gc_result;
+ } else {
+ // No return value for the call.
+ return nullptr;
+ }
+}
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index 1bbaaf3..98016b4 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -917,10 +917,13 @@ void Reassociate::RewriteExprTree(BinaryOperator *I,
/// version of the value is returned, and BI is left pointing at the instruction
/// that should be processed next by the reassociation pass.
static Value *NegateValue(Value *V, Instruction *BI) {
- if (ConstantFP *C = dyn_cast<ConstantFP>(V))
- return ConstantExpr::getFNeg(C);
- if (Constant *C = dyn_cast<Constant>(V))
+ if (Constant *C = dyn_cast<Constant>(V)) {
+ if (C->getType()->isFPOrFPVectorTy()) {
+ return ConstantExpr::getFNeg(C);
+ }
return ConstantExpr::getNeg(C);
+ }
+
// We are trying to expose opportunity for reassociation. One of the things
// that we want to do to achieve this is to push a negation as deep into an
@@ -1512,7 +1515,7 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
++NumFound;
} while (i != Ops.size() && Ops[i].Op == TheOp);
- DEBUG(errs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n');
+ DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n');
++NumFactor;
// Insert a new multiply.
@@ -1650,7 +1653,7 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
// If any factor occurred more than one time, we can pull it out.
if (MaxOcc > 1) {
- DEBUG(errs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n');
+ DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n');
++NumFactor;
// Create a new instruction that uses the MaxOccVal twice. If we don't do
@@ -1988,7 +1991,7 @@ Instruction *Reassociate::canonicalizeNegConstExpr(Instruction *I) {
Constant *C = C0 ? C0 : C1;
unsigned ConstIdx = C0 ? 0 : 1;
if (auto *CI = dyn_cast<ConstantInt>(C)) {
- if (!CI->isNegative())
+ if (!CI->isNegative() || CI->isMinValue(true))
return nullptr;
} else if (auto *CF = dyn_cast<ConstantFP>(C)) {
if (!CF->isNegative())
diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp
index b6023e2..1b46727 100644
--- a/lib/Transforms/Scalar/Reg2Mem.cpp
+++ b/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -73,7 +73,7 @@ bool RegToMem::runOnFunction(Function &F) {
// Insert all new allocas into entry block.
BasicBlock *BBEntry = &F.getEntryBlock();
- assert(pred_begin(BBEntry) == pred_end(BBEntry) &&
+ assert(pred_empty(BBEntry) &&
"Entry block to function must not have predecessors!");
// Find first non-alloca instruction and create insertion point. This is
diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
new file mode 100644
index 0000000..ca9ab54
--- /dev/null
+++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -0,0 +1,1897 @@
+//===- RewriteStatepointsForGC.cpp - Make GC relocations explicit ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Rewrite an existing set of gc.statepoints such that they make potential
+// relocations performed by the garbage collector explicit in the IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Pass.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+
+#define DEBUG_TYPE "rewrite-statepoints-for-gc"
+
+using namespace llvm;
+
+// Print tracing output
+static cl::opt<bool> TraceLSP("trace-rewrite-statepoints", cl::Hidden,
+ cl::init(false));
+
+// Print the liveset found at the insert location
+static cl::opt<bool> PrintLiveSet("spp-print-liveset", cl::Hidden,
+ cl::init(false));
+static cl::opt<bool> PrintLiveSetSize("spp-print-liveset-size",
+ cl::Hidden, cl::init(false));
+// Print out the base pointers for debugging
+static cl::opt<bool> PrintBasePointers("spp-print-base-pointers",
+ cl::Hidden, cl::init(false));
+
+namespace {
+struct RewriteStatepointsForGC : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ RewriteStatepointsForGC() : FunctionPass(ID) {
+ initializeRewriteStatepointsForGCPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // We add and rewrite a bunch of instructions, but don't really do much
+ // else. We could in theory preserve a lot more analyses here.
+ AU.addRequired<DominatorTreeWrapperPass>();
+ }
+};
+} // namespace
+
+char RewriteStatepointsForGC::ID = 0;
+
+FunctionPass *llvm::createRewriteStatepointsForGCPass() {
+ return new RewriteStatepointsForGC();
+}
+
+INITIALIZE_PASS_BEGIN(RewriteStatepointsForGC, "rewrite-statepoints-for-gc",
+ "Make relocations explicit at statepoints", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(RewriteStatepointsForGC, "rewrite-statepoints-for-gc",
+ "Make relocations explicit at statepoints", false, false)
+
+namespace {
+// The type of the internal cache used inside the findBasePointers family
+// of functions. From the callers perspective, this is an opaque type and
+// should not be inspected.
+//
+// In the actual implementation this caches two relations:
+// - The base relation itself (i.e. this pointer is based on that one)
+// - The base defining value relation (i.e. before base_phi insertion)
+// Generally, after the execution of a full findBasePointer call, only the
+// base relation will remain. Internally, we add a mixture of the two
+// types, then update all the second type to the first type
+typedef DenseMap<Value *, Value *> DefiningValueMapTy;
+typedef DenseSet<llvm::Value *> StatepointLiveSetTy;
+
+struct PartiallyConstructedSafepointRecord {
+ /// The set of values known to be live accross this safepoint
+ StatepointLiveSetTy liveset;
+
+ /// Mapping from live pointers to a base-defining-value
+ DenseMap<llvm::Value *, llvm::Value *> PointerToBase;
+
+ /// Any new values which were added to the IR during base pointer analysis
+ /// for this safepoint
+ DenseSet<llvm::Value *> NewInsertedDefs;
+
+ /// The *new* gc.statepoint instruction itself. This produces the token
+ /// that normal path gc.relocates and the gc.result are tied to.
+ Instruction *StatepointToken;
+
+ /// Instruction to which exceptional gc relocates are attached
+ /// Makes it easier to iterate through them during relocationViaAlloca.
+ Instruction *UnwindToken;
+};
+}
+
+// TODO: Once we can get to the GCStrategy, this becomes
+// Optional<bool> isGCManagedPointer(const Value *V) const override {
+
+static bool isGCPointerType(const Type *T) {
+ if (const PointerType *PT = dyn_cast<PointerType>(T))
+ // For the sake of this example GC, we arbitrarily pick addrspace(1) as our
+ // GC managed heap. We know that a pointer into this heap needs to be
+ // updated and that no other pointer does.
+ return (1 == PT->getAddressSpace());
+ return false;
+}
+
+/// Return true if the Value is a gc reference type which is potentially used
+/// after the instruction 'loc'. This is only used with the edge reachability
+/// liveness code. Note: It is assumed the V dominates loc.
+static bool isLiveGCReferenceAt(Value &V, Instruction *loc, DominatorTree &DT,
+ LoopInfo *LI) {
+ if (!isGCPointerType(V.getType()))
+ return false;
+
+ if (V.use_empty())
+ return false;
+
+ // Given assumption that V dominates loc, this may be live
+ return true;
+}
+
+#ifndef NDEBUG
+static bool isAggWhichContainsGCPtrType(Type *Ty) {
+ if (VectorType *VT = dyn_cast<VectorType>(Ty))
+ return isGCPointerType(VT->getScalarType());
+ if (ArrayType *AT = dyn_cast<ArrayType>(Ty))
+ return isGCPointerType(AT->getElementType()) ||
+ isAggWhichContainsGCPtrType(AT->getElementType());
+ if (StructType *ST = dyn_cast<StructType>(Ty))
+ return std::any_of(ST->subtypes().begin(), ST->subtypes().end(),
+ [](Type *SubType) {
+ return isGCPointerType(SubType) ||
+ isAggWhichContainsGCPtrType(SubType);
+ });
+ return false;
+}
+#endif
+
+// Conservatively identifies any definitions which might be live at the
+// given instruction. The analysis is performed immediately before the
+// given instruction. Values defined by that instruction are not considered
+// live. Values used by that instruction are considered live.
+//
+// preconditions: valid IR graph, term is either a terminator instruction or
+// a call instruction, pred is the basic block of term, DT, LI are valid
+//
+// side effects: none, does not mutate IR
+//
+// postconditions: populates liveValues as discussed above
+static void findLiveGCValuesAtInst(Instruction *term, BasicBlock *pred,
+ DominatorTree &DT, LoopInfo *LI,
+ StatepointLiveSetTy &liveValues) {
+ liveValues.clear();
+
+ assert(isa<CallInst>(term) || isa<InvokeInst>(term) || term->isTerminator());
+
+ Function *F = pred->getParent();
+
+ auto is_live_gc_reference =
+ [&](Value &V) { return isLiveGCReferenceAt(V, term, DT, LI); };
+
+ // Are there any gc pointer arguments live over this point? This needs to be
+ // special cased since arguments aren't defined in basic blocks.
+ for (Argument &arg : F->args()) {
+ assert(!isAggWhichContainsGCPtrType(arg.getType()) &&
+ "support for FCA unimplemented");
+
+ if (is_live_gc_reference(arg)) {
+ liveValues.insert(&arg);
+ }
+ }
+
+ // Walk through all dominating blocks - the ones which can contain
+ // definitions used in this block - and check to see if any of the values
+ // they define are used in locations potentially reachable from the
+ // interesting instruction.
+ BasicBlock *BBI = pred;
+ while (true) {
+ if (TraceLSP) {
+ errs() << "[LSP] Looking at dominating block " << pred->getName() << "\n";
+ }
+ assert(DT.dominates(BBI, pred));
+ assert(isPotentiallyReachable(BBI, pred, &DT) &&
+ "dominated block must be reachable");
+
+ // Walk through the instructions in dominating blocks and keep any
+ // that have a use potentially reachable from the block we're
+ // considering putting the safepoint in
+ for (Instruction &inst : *BBI) {
+ if (TraceLSP) {
+ errs() << "[LSP] Looking at instruction ";
+ inst.dump();
+ }
+
+ if (pred == BBI && (&inst) == term) {
+ if (TraceLSP) {
+ errs() << "[LSP] stopped because we encountered the safepoint "
+ "instruction.\n";
+ }
+
+ // If we're in the block which defines the interesting instruction,
+ // we don't want to include any values as live which are defined
+ // _after_ the interesting line or as part of the line itself
+ // i.e. "term" is the call instruction for a call safepoint, the
+ // results of the call should not be considered live in that stackmap
+ break;
+ }
+
+ assert(!isAggWhichContainsGCPtrType(inst.getType()) &&
+ "support for FCA unimplemented");
+
+ if (is_live_gc_reference(inst)) {
+ if (TraceLSP) {
+ errs() << "[LSP] found live value for this safepoint ";
+ inst.dump();
+ term->dump();
+ }
+ liveValues.insert(&inst);
+ }
+ }
+ if (!DT.getNode(BBI)->getIDom()) {
+ assert(BBI == &F->getEntryBlock() &&
+ "failed to find a dominator for something other than "
+ "the entry block");
+ break;
+ }
+ BBI = DT.getNode(BBI)->getIDom()->getBlock();
+ }
+}
+
+static bool order_by_name(llvm::Value *a, llvm::Value *b) {
+ if (a->hasName() && b->hasName()) {
+ return -1 == a->getName().compare(b->getName());
+ } else if (a->hasName() && !b->hasName()) {
+ return true;
+ } else if (!a->hasName() && b->hasName()) {
+ return false;
+ } else {
+ // Better than nothing, but not stable
+ return a < b;
+ }
+}
+
+/// Find the initial live set. Note that due to base pointer
+/// insertion, the live set may be incomplete.
+static void
+analyzeParsePointLiveness(DominatorTree &DT, const CallSite &CS,
+ PartiallyConstructedSafepointRecord &result) {
+ Instruction *inst = CS.getInstruction();
+
+ BasicBlock *BB = inst->getParent();
+ StatepointLiveSetTy liveset;
+ findLiveGCValuesAtInst(inst, BB, DT, nullptr, liveset);
+
+ if (PrintLiveSet) {
+ // Note: This output is used by several of the test cases
+ // The order of elemtns in a set is not stable, put them in a vec and sort
+ // by name
+ SmallVector<Value *, 64> temp;
+ temp.insert(temp.end(), liveset.begin(), liveset.end());
+ std::sort(temp.begin(), temp.end(), order_by_name);
+ errs() << "Live Variables:\n";
+ for (Value *V : temp) {
+ errs() << " " << V->getName(); // no newline
+ V->dump();
+ }
+ }
+ if (PrintLiveSetSize) {
+ errs() << "Safepoint For: " << CS.getCalledValue()->getName() << "\n";
+ errs() << "Number live values: " << liveset.size() << "\n";
+ }
+ result.liveset = liveset;
+}
+
+/// True iff this value is the null pointer constant (of any pointer type)
+static bool LLVM_ATTRIBUTE_UNUSED isNullConstant(Value *V) {
+ return isa<Constant>(V) && isa<PointerType>(V->getType()) &&
+ cast<Constant>(V)->isNullValue();
+}
+
+/// Helper function for findBasePointer - Will return a value which either a)
+/// defines the base pointer for the input or b) blocks the simple search
+/// (i.e. a PHI or Select of two derived pointers)
+static Value *findBaseDefiningValue(Value *I) {
+ assert(I->getType()->isPointerTy() &&
+ "Illegal to ask for the base pointer of a non-pointer type");
+
+ // There are instructions which can never return gc pointer values. Sanity
+ // check
+ // that this is actually true.
+ assert(!isa<InsertElementInst>(I) && !isa<ExtractElementInst>(I) &&
+ !isa<ShuffleVectorInst>(I) && "Vector types are not gc pointers");
+ assert((!isa<Instruction>(I) || isa<InvokeInst>(I) ||
+ !cast<Instruction>(I)->isTerminator()) &&
+ "With the exception of invoke terminators don't define values");
+ assert(!isa<StoreInst>(I) && !isa<FenceInst>(I) &&
+ "Can't be definitions to start with");
+ assert(!isa<ICmpInst>(I) && !isa<FCmpInst>(I) &&
+ "Comparisons don't give ops");
+ // There's a bunch of instructions which just don't make sense to apply to
+ // a pointer. The only valid reason for this would be pointer bit
+ // twiddling which we're just not going to support.
+ assert((!isa<Instruction>(I) || !cast<Instruction>(I)->isBinaryOp()) &&
+ "Binary ops on pointer values are meaningless. Unless your "
+ "bit-twiddling which we don't support");
+
+ if (Argument *Arg = dyn_cast<Argument>(I)) {
+ // An incoming argument to the function is a base pointer
+ // We should have never reached here if this argument isn't an gc value
+ assert(Arg->getType()->isPointerTy() &&
+ "Base for pointer must be another pointer");
+ return Arg;
+ }
+
+ if (GlobalVariable *global = dyn_cast<GlobalVariable>(I)) {
+ // base case
+ assert(global->getType()->isPointerTy() &&
+ "Base for pointer must be another pointer");
+ return global;
+ }
+
+ // inlining could possibly introduce phi node that contains
+ // undef if callee has multiple returns
+ if (UndefValue *undef = dyn_cast<UndefValue>(I)) {
+ assert(undef->getType()->isPointerTy() &&
+ "Base for pointer must be another pointer");
+ return undef; // utterly meaningless, but useful for dealing with
+ // partially optimized code.
+ }
+
+ // Due to inheritance, this must be _after_ the global variable and undef
+ // checks
+ if (Constant *con = dyn_cast<Constant>(I)) {
+ assert(!isa<GlobalVariable>(I) && !isa<UndefValue>(I) &&
+ "order of checks wrong!");
+ // Note: Finding a constant base for something marked for relocation
+ // doesn't really make sense. The most likely case is either a) some
+ // screwed up the address space usage or b) your validating against
+ // compiled C++ code w/o the proper separation. The only real exception
+ // is a null pointer. You could have generic code written to index of
+ // off a potentially null value and have proven it null. We also use
+ // null pointers in dead paths of relocation phis (which we might later
+ // want to find a base pointer for).
+ assert(con->getType()->isPointerTy() &&
+ "Base for pointer must be another pointer");
+ assert(con->isNullValue() && "null is the only case which makes sense");
+ return con;
+ }
+
+ if (CastInst *CI = dyn_cast<CastInst>(I)) {
+ Value *def = CI->stripPointerCasts();
+ assert(def->getType()->isPointerTy() &&
+ "Base for pointer must be another pointer");
+ // If we find a cast instruction here, it means we've found a cast which is
+ // not simply a pointer cast (i.e. an inttoptr). We don't know how to
+ // handle int->ptr conversion.
+ assert(!isa<CastInst>(def) && "shouldn't find another cast here");
+ return findBaseDefiningValue(def);
+ }
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ if (LI->getType()->isPointerTy()) {
+ Value *Op = LI->getOperand(0);
+ (void)Op;
+ // Has to be a pointer to an gc object, or possibly an array of such?
+ assert(Op->getType()->isPointerTy());
+ return LI; // The value loaded is an gc base itself
+ }
+ }
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ Value *Op = GEP->getOperand(0);
+ if (Op->getType()->isPointerTy()) {
+ return findBaseDefiningValue(Op); // The base of this GEP is the base
+ }
+ }
+
+ if (AllocaInst *alloc = dyn_cast<AllocaInst>(I)) {
+ // An alloca represents a conceptual stack slot. It's the slot itself
+ // that the GC needs to know about, not the value in the slot.
+ assert(alloc->getType()->isPointerTy() &&
+ "Base for pointer must be another pointer");
+ return alloc;
+ }
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ // fall through to general call handling
+ break;
+ case Intrinsic::experimental_gc_statepoint:
+ case Intrinsic::experimental_gc_result_float:
+ case Intrinsic::experimental_gc_result_int:
+ llvm_unreachable("these don't produce pointers");
+ case Intrinsic::experimental_gc_result_ptr:
+ // This is just a special case of the CallInst check below to handle a
+ // statepoint with deopt args which hasn't been rewritten for GC yet.
+ // TODO: Assert that the statepoint isn't rewritten yet.
+ return II;
+ case Intrinsic::experimental_gc_relocate: {
+ // Rerunning safepoint insertion after safepoints are already
+ // inserted is not supported. It could probably be made to work,
+ // but why are you doing this? There's no good reason.
+ llvm_unreachable("repeat safepoint insertion is not supported");
+ }
+ case Intrinsic::gcroot:
+ // Currently, this mechanism hasn't been extended to work with gcroot.
+ // There's no reason it couldn't be, but I haven't thought about the
+ // implications much.
+ llvm_unreachable(
+ "interaction with the gcroot mechanism is not supported");
+ }
+ }
+ // We assume that functions in the source language only return base
+ // pointers. This should probably be generalized via attributes to support
+ // both source language and internal functions.
+ if (CallInst *call = dyn_cast<CallInst>(I)) {
+ assert(call->getType()->isPointerTy() &&
+ "Base for pointer must be another pointer");
+ return call;
+ }
+ if (InvokeInst *invoke = dyn_cast<InvokeInst>(I)) {
+ assert(invoke->getType()->isPointerTy() &&
+ "Base for pointer must be another pointer");
+ return invoke;
+ }
+
+ // I have absolutely no idea how to implement this part yet. It's not
+ // neccessarily hard, I just haven't really looked at it yet.
+ assert(!isa<LandingPadInst>(I) && "Landing Pad is unimplemented");
+
+ if (AtomicCmpXchgInst *cas = dyn_cast<AtomicCmpXchgInst>(I)) {
+ // A CAS is effectively a atomic store and load combined under a
+ // predicate. From the perspective of base pointers, we just treat it
+ // like a load. We loaded a pointer from a address in memory, that value
+ // had better be a valid base pointer.
+ return cas->getPointerOperand();
+ }
+ if (AtomicRMWInst *atomic = dyn_cast<AtomicRMWInst>(I)) {
+ assert(AtomicRMWInst::Xchg == atomic->getOperation() &&
+ "All others are binary ops which don't apply to base pointers");
+ // semantically, a load, store pair. Treat it the same as a standard load
+ return atomic->getPointerOperand();
+ }
+
+ // The aggregate ops. Aggregates can either be in the heap or on the
+ // stack, but in either case, this is simply a field load. As a result,
+ // this is a defining definition of the base just like a load is.
+ if (ExtractValueInst *ev = dyn_cast<ExtractValueInst>(I)) {
+ return ev;
+ }
+
+ // We should never see an insert vector since that would require we be
+ // tracing back a struct value not a pointer value.
+ assert(!isa<InsertValueInst>(I) &&
+ "Base pointer for a struct is meaningless");
+
+ // The last two cases here don't return a base pointer. Instead, they
+ // return a value which dynamically selects from amoung several base
+ // derived pointers (each with it's own base potentially). It's the job of
+ // the caller to resolve these.
+ if (SelectInst *select = dyn_cast<SelectInst>(I)) {
+ return select;
+ }
+
+ return cast<PHINode>(I);
+}
+
+/// Returns the base defining value for this value.
+static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &cache) {
+ Value *&Cached = cache[I];
+ if (!Cached) {
+ Cached = findBaseDefiningValue(I);
+ }
+ assert(cache[I] != nullptr);
+
+ if (TraceLSP) {
+ errs() << "fBDV-cached: " << I->getName() << " -> " << Cached->getName()
+ << "\n";
+ }
+ return Cached;
+}
+
+/// Return a base pointer for this value if known. Otherwise, return it's
+/// base defining value.
+static Value *findBaseOrBDV(Value *I, DefiningValueMapTy &cache) {
+ Value *def = findBaseDefiningValueCached(I, cache);
+ auto Found = cache.find(def);
+ if (Found != cache.end()) {
+ // Either a base-of relation, or a self reference. Caller must check.
+ return Found->second;
+ }
+ // Only a BDV available
+ return def;
+}
+
+/// Given the result of a call to findBaseDefiningValue, or findBaseOrBDV,
+/// is it known to be a base pointer? Or do we need to continue searching.
+static bool isKnownBaseResult(Value *v) {
+ if (!isa<PHINode>(v) && !isa<SelectInst>(v)) {
+ // no recursion possible
+ return true;
+ }
+ if (cast<Instruction>(v)->getMetadata("is_base_value")) {
+ // This is a previously inserted base phi or select. We know
+ // that this is a base value.
+ return true;
+ }
+
+ // We need to keep searching
+ return false;
+}
+
+// TODO: find a better name for this
+namespace {
+class PhiState {
+public:
+ enum Status { Unknown, Base, Conflict };
+
+ PhiState(Status s, Value *b = nullptr) : status(s), base(b) {
+ assert(status != Base || b);
+ }
+ PhiState(Value *b) : status(Base), base(b) {}
+ PhiState() : status(Unknown), base(nullptr) {}
+ PhiState(const PhiState &other) : status(other.status), base(other.base) {
+ assert(status != Base || base);
+ }
+
+ Status getStatus() const { return status; }
+ Value *getBase() const { return base; }
+
+ bool isBase() const { return getStatus() == Base; }
+ bool isUnknown() const { return getStatus() == Unknown; }
+ bool isConflict() const { return getStatus() == Conflict; }
+
+ bool operator==(const PhiState &other) const {
+ return base == other.base && status == other.status;
+ }
+
+ bool operator!=(const PhiState &other) const { return !(*this == other); }
+
+ void dump() {
+ errs() << status << " (" << base << " - "
+ << (base ? base->getName() : "nullptr") << "): ";
+ }
+
+private:
+ Status status;
+ Value *base; // non null only if status == base
+};
+
+typedef DenseMap<Value *, PhiState> ConflictStateMapTy;
+// Values of type PhiState form a lattice, and this is a helper
+// class that implementes the meet operation. The meat of the meet
+// operation is implemented in MeetPhiStates::pureMeet
+class MeetPhiStates {
+public:
+ // phiStates is a mapping from PHINodes and SelectInst's to PhiStates.
+ explicit MeetPhiStates(const ConflictStateMapTy &phiStates)
+ : phiStates(phiStates) {}
+
+ // Destructively meet the current result with the base V. V can
+ // either be a merge instruction (SelectInst / PHINode), in which
+ // case its status is looked up in the phiStates map; or a regular
+ // SSA value, in which case it is assumed to be a base.
+ void meetWith(Value *V) {
+ PhiState otherState = getStateForBDV(V);
+ assert((MeetPhiStates::pureMeet(otherState, currentResult) ==
+ MeetPhiStates::pureMeet(currentResult, otherState)) &&
+ "math is wrong: meet does not commute!");
+ currentResult = MeetPhiStates::pureMeet(otherState, currentResult);
+ }
+
+ PhiState getResult() const { return currentResult; }
+
+private:
+ const ConflictStateMapTy &phiStates;
+ PhiState currentResult;
+
+ /// Return a phi state for a base defining value. We'll generate a new
+ /// base state for known bases and expect to find a cached state otherwise
+ PhiState getStateForBDV(Value *baseValue) {
+ if (isKnownBaseResult(baseValue)) {
+ return PhiState(baseValue);
+ } else {
+ return lookupFromMap(baseValue);
+ }
+ }
+
+ PhiState lookupFromMap(Value *V) {
+ auto I = phiStates.find(V);
+ assert(I != phiStates.end() && "lookup failed!");
+ return I->second;
+ }
+
+ static PhiState pureMeet(const PhiState &stateA, const PhiState &stateB) {
+ switch (stateA.getStatus()) {
+ case PhiState::Unknown:
+ return stateB;
+
+ case PhiState::Base:
+ assert(stateA.getBase() && "can't be null");
+ if (stateB.isUnknown())
+ return stateA;
+
+ if (stateB.isBase()) {
+ if (stateA.getBase() == stateB.getBase()) {
+ assert(stateA == stateB && "equality broken!");
+ return stateA;
+ }
+ return PhiState(PhiState::Conflict);
+ }
+ assert(stateB.isConflict() && "only three states!");
+ return PhiState(PhiState::Conflict);
+
+ case PhiState::Conflict:
+ return stateA;
+ }
+ llvm_unreachable("only three states!");
+ }
+};
+}
+/// For a given value or instruction, figure out what base ptr it's derived
+/// from. For gc objects, this is simply itself. On success, returns a value
+/// which is the base pointer. (This is reliable and can be used for
+/// relocation.) On failure, returns nullptr.
+static Value *findBasePointer(Value *I, DefiningValueMapTy &cache,
+ DenseSet<llvm::Value *> &NewInsertedDefs) {
+ Value *def = findBaseOrBDV(I, cache);
+
+ if (isKnownBaseResult(def)) {
+ return def;
+ }
+
+ // Here's the rough algorithm:
+ // - For every SSA value, construct a mapping to either an actual base
+ // pointer or a PHI which obscures the base pointer.
+ // - Construct a mapping from PHI to unknown TOP state. Use an
+ // optimistic algorithm to propagate base pointer information. Lattice
+ // looks like:
+ // UNKNOWN
+ // b1 b2 b3 b4
+ // CONFLICT
+ // When algorithm terminates, all PHIs will either have a single concrete
+ // base or be in a conflict state.
+ // - For every conflict, insert a dummy PHI node without arguments. Add
+ // these to the base[Instruction] = BasePtr mapping. For every
+ // non-conflict, add the actual base.
+ // - For every conflict, add arguments for the base[a] of each input
+ // arguments.
+ //
+ // Note: A simpler form of this would be to add the conflict form of all
+ // PHIs without running the optimistic algorithm. This would be
+ // analougous to pessimistic data flow and would likely lead to an
+ // overall worse solution.
+
+ ConflictStateMapTy states;
+ states[def] = PhiState();
+ // Recursively fill in all phis & selects reachable from the initial one
+ // for which we don't already know a definite base value for
+ // PERF: Yes, this is as horribly inefficient as it looks.
+ bool done = false;
+ while (!done) {
+ done = true;
+ for (auto Pair : states) {
+ Value *v = Pair.first;
+ assert(!isKnownBaseResult(v) && "why did it get added?");
+ if (PHINode *phi = dyn_cast<PHINode>(v)) {
+ assert(phi->getNumIncomingValues() > 0 &&
+ "zero input phis are illegal");
+ for (Value *InVal : phi->incoming_values()) {
+ Value *local = findBaseOrBDV(InVal, cache);
+ if (!isKnownBaseResult(local) && states.find(local) == states.end()) {
+ states[local] = PhiState();
+ done = false;
+ }
+ }
+ } else if (SelectInst *sel = dyn_cast<SelectInst>(v)) {
+ Value *local = findBaseOrBDV(sel->getTrueValue(), cache);
+ if (!isKnownBaseResult(local) && states.find(local) == states.end()) {
+ states[local] = PhiState();
+ done = false;
+ }
+ local = findBaseOrBDV(sel->getFalseValue(), cache);
+ if (!isKnownBaseResult(local) && states.find(local) == states.end()) {
+ states[local] = PhiState();
+ done = false;
+ }
+ }
+ }
+ }
+
+ if (TraceLSP) {
+ errs() << "States after initialization:\n";
+ for (auto Pair : states) {
+ Instruction *v = cast<Instruction>(Pair.first);
+ PhiState state = Pair.second;
+ state.dump();
+ v->dump();
+ }
+ }
+
+ // TODO: come back and revisit the state transitions around inputs which
+ // have reached conflict state. The current version seems too conservative.
+
+ bool progress = true;
+ size_t oldSize = 0;
+ while (progress) {
+ oldSize = states.size();
+ progress = false;
+ for (auto Pair : states) {
+ MeetPhiStates calculateMeet(states);
+ Value *v = Pair.first;
+ assert(!isKnownBaseResult(v) && "why did it get added?");
+ if (SelectInst *select = dyn_cast<SelectInst>(v)) {
+ calculateMeet.meetWith(findBaseOrBDV(select->getTrueValue(), cache));
+ calculateMeet.meetWith(findBaseOrBDV(select->getFalseValue(), cache));
+ } else
+ for (Value *Val : cast<PHINode>(v)->incoming_values())
+ calculateMeet.meetWith(findBaseOrBDV(Val, cache));
+
+ PhiState oldState = states[v];
+ PhiState newState = calculateMeet.getResult();
+ if (oldState != newState) {
+ progress = true;
+ states[v] = newState;
+ }
+ }
+
+ assert(oldSize <= states.size());
+ assert(oldSize == states.size() || progress);
+ }
+
+ if (TraceLSP) {
+ errs() << "States after meet iteration:\n";
+ for (auto Pair : states) {
+ Instruction *v = cast<Instruction>(Pair.first);
+ PhiState state = Pair.second;
+ state.dump();
+ v->dump();
+ }
+ }
+
+ // Insert Phis for all conflicts
+ for (auto Pair : states) {
+ Instruction *v = cast<Instruction>(Pair.first);
+ PhiState state = Pair.second;
+ assert(!isKnownBaseResult(v) && "why did it get added?");
+ assert(!state.isUnknown() && "Optimistic algorithm didn't complete!");
+ if (state.isConflict()) {
+ if (isa<PHINode>(v)) {
+ int num_preds =
+ std::distance(pred_begin(v->getParent()), pred_end(v->getParent()));
+ assert(num_preds > 0 && "how did we reach here");
+ PHINode *phi = PHINode::Create(v->getType(), num_preds, "base_phi", v);
+ NewInsertedDefs.insert(phi);
+ // Add metadata marking this as a base value
+ auto *const_1 = ConstantInt::get(
+ Type::getInt32Ty(
+ v->getParent()->getParent()->getParent()->getContext()),
+ 1);
+ auto MDConst = ConstantAsMetadata::get(const_1);
+ MDNode *md = MDNode::get(
+ v->getParent()->getParent()->getParent()->getContext(), MDConst);
+ phi->setMetadata("is_base_value", md);
+ states[v] = PhiState(PhiState::Conflict, phi);
+ } else if (SelectInst *sel = dyn_cast<SelectInst>(v)) {
+ // The undef will be replaced later
+ UndefValue *undef = UndefValue::get(sel->getType());
+ SelectInst *basesel = SelectInst::Create(sel->getCondition(), undef,
+ undef, "base_select", sel);
+ NewInsertedDefs.insert(basesel);
+ // Add metadata marking this as a base value
+ auto *const_1 = ConstantInt::get(
+ Type::getInt32Ty(
+ v->getParent()->getParent()->getParent()->getContext()),
+ 1);
+ auto MDConst = ConstantAsMetadata::get(const_1);
+ MDNode *md = MDNode::get(
+ v->getParent()->getParent()->getParent()->getContext(), MDConst);
+ basesel->setMetadata("is_base_value", md);
+ states[v] = PhiState(PhiState::Conflict, basesel);
+ } else
+ llvm_unreachable("unknown conflict type");
+ }
+ }
+
+ // Fixup all the inputs of the new PHIs
+ for (auto Pair : states) {
+ Instruction *v = cast<Instruction>(Pair.first);
+ PhiState state = Pair.second;
+
+ assert(!isKnownBaseResult(v) && "why did it get added?");
+ assert(!state.isUnknown() && "Optimistic algorithm didn't complete!");
+ if (state.isConflict()) {
+ if (PHINode *basephi = dyn_cast<PHINode>(state.getBase())) {
+ PHINode *phi = cast<PHINode>(v);
+ unsigned NumPHIValues = phi->getNumIncomingValues();
+ for (unsigned i = 0; i < NumPHIValues; i++) {
+ Value *InVal = phi->getIncomingValue(i);
+ BasicBlock *InBB = phi->getIncomingBlock(i);
+
+ // If we've already seen InBB, add the same incoming value
+ // we added for it earlier. The IR verifier requires phi
+ // nodes with multiple entries from the same basic block
+ // to have the same incoming value for each of those
+ // entries. If we don't do this check here and basephi
+ // has a different type than base, we'll end up adding two
+ // bitcasts (and hence two distinct values) as incoming
+ // values for the same basic block.
+
+ int blockIndex = basephi->getBasicBlockIndex(InBB);
+ if (blockIndex != -1) {
+ Value *oldBase = basephi->getIncomingValue(blockIndex);
+ basephi->addIncoming(oldBase, InBB);
+#ifndef NDEBUG
+ Value *base = findBaseOrBDV(InVal, cache);
+ if (!isKnownBaseResult(base)) {
+ // Either conflict or base.
+ assert(states.count(base));
+ base = states[base].getBase();
+ assert(base != nullptr && "unknown PhiState!");
+ assert(NewInsertedDefs.count(base) &&
+ "should have already added this in a prev. iteration!");
+ }
+
+ // In essense this assert states: the only way two
+ // values incoming from the same basic block may be
+ // different is by being different bitcasts of the same
+ // value. A cleanup that remains TODO is changing
+ // findBaseOrBDV to return an llvm::Value of the correct
+ // type (and still remain pure). This will remove the
+ // need to add bitcasts.
+ assert(base->stripPointerCasts() == oldBase->stripPointerCasts() &&
+ "sanity -- findBaseOrBDV should be pure!");
+#endif
+ continue;
+ }
+
+ // Find either the defining value for the PHI or the normal base for
+ // a non-phi node
+ Value *base = findBaseOrBDV(InVal, cache);
+ if (!isKnownBaseResult(base)) {
+ // Either conflict or base.
+ assert(states.count(base));
+ base = states[base].getBase();
+ assert(base != nullptr && "unknown PhiState!");
+ }
+ assert(base && "can't be null");
+ // Must use original input BB since base may not be Instruction
+ // The cast is needed since base traversal may strip away bitcasts
+ if (base->getType() != basephi->getType()) {
+ base = new BitCastInst(base, basephi->getType(), "cast",
+ InBB->getTerminator());
+ NewInsertedDefs.insert(base);
+ }
+ basephi->addIncoming(base, InBB);
+ }
+ assert(basephi->getNumIncomingValues() == NumPHIValues);
+ } else if (SelectInst *basesel = dyn_cast<SelectInst>(state.getBase())) {
+ SelectInst *sel = cast<SelectInst>(v);
+ // Operand 1 & 2 are true, false path respectively. TODO: refactor to
+ // something more safe and less hacky.
+ for (int i = 1; i <= 2; i++) {
+ Value *InVal = sel->getOperand(i);
+ // Find either the defining value for the PHI or the normal base for
+ // a non-phi node
+ Value *base = findBaseOrBDV(InVal, cache);
+ if (!isKnownBaseResult(base)) {
+ // Either conflict or base.
+ assert(states.count(base));
+ base = states[base].getBase();
+ assert(base != nullptr && "unknown PhiState!");
+ }
+ assert(base && "can't be null");
+ // Must use original input BB since base may not be Instruction
+ // The cast is needed since base traversal may strip away bitcasts
+ if (base->getType() != basesel->getType()) {
+ base = new BitCastInst(base, basesel->getType(), "cast", basesel);
+ NewInsertedDefs.insert(base);
+ }
+ basesel->setOperand(i, base);
+ }
+ } else
+ llvm_unreachable("unexpected conflict type");
+ }
+ }
+
+ // Cache all of our results so we can cheaply reuse them
+ // NOTE: This is actually two caches: one of the base defining value
+ // relation and one of the base pointer relation! FIXME
+ for (auto item : states) {
+ Value *v = item.first;
+ Value *base = item.second.getBase();
+ assert(v && base);
+ assert(!isKnownBaseResult(v) && "why did it get added?");
+
+ if (TraceLSP) {
+ std::string fromstr =
+ cache.count(v) ? (cache[v]->hasName() ? cache[v]->getName() : "")
+ : "none";
+ errs() << "Updating base value cache"
+ << " for: " << (v->hasName() ? v->getName() : "")
+ << " from: " << fromstr
+ << " to: " << (base->hasName() ? base->getName() : "") << "\n";
+ }
+
+ assert(isKnownBaseResult(base) &&
+ "must be something we 'know' is a base pointer");
+ if (cache.count(v)) {
+ // Once we transition from the BDV relation being store in the cache to
+ // the base relation being stored, it must be stable
+ assert((!isKnownBaseResult(cache[v]) || cache[v] == base) &&
+ "base relation should be stable");
+ }
+ cache[v] = base;
+ }
+ assert(cache.find(def) != cache.end());
+ return cache[def];
+}
+
+// For a set of live pointers (base and/or derived), identify the base
+// pointer of the object which they are derived from. This routine will
+// mutate the IR graph as needed to make the 'base' pointer live at the
+// definition site of 'derived'. This ensures that any use of 'derived' can
+// also use 'base'. This may involve the insertion of a number of
+// additional PHI nodes.
+//
+// preconditions: live is a set of pointer type Values
+//
+// side effects: may insert PHI nodes into the existing CFG, will preserve
+// CFG, will not remove or mutate any existing nodes
+//
+// post condition: PointerToBase contains one (derived, base) pair for every
+// pointer in live. Note that derived can be equal to base if the original
+// pointer was a base pointer.
+static void findBasePointers(const StatepointLiveSetTy &live,
+ DenseMap<llvm::Value *, llvm::Value *> &PointerToBase,
+ DominatorTree *DT, DefiningValueMapTy &DVCache,
+ DenseSet<llvm::Value *> &NewInsertedDefs) {
+ for (Value *ptr : live) {
+ Value *base = findBasePointer(ptr, DVCache, NewInsertedDefs);
+ assert(base && "failed to find base pointer");
+ PointerToBase[ptr] = base;
+ assert((!isa<Instruction>(base) || !isa<Instruction>(ptr) ||
+ DT->dominates(cast<Instruction>(base)->getParent(),
+ cast<Instruction>(ptr)->getParent())) &&
+ "The base we found better dominate the derived pointer");
+
+ // If you see this trip and like to live really dangerously, the code should
+ // be correct, just with idioms the verifier can't handle. You can try
+ // disabling the verifier at your own substaintial risk.
+ assert(!isNullConstant(base) && "the relocation code needs adjustment to "
+ "handle the relocation of a null pointer "
+ "constant without causing false positives "
+ "in the safepoint ir verifier.");
+ }
+}
+
+/// Find the required based pointers (and adjust the live set) for the given
+/// parse point.
+static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache,
+ const CallSite &CS,
+ PartiallyConstructedSafepointRecord &result) {
+ DenseMap<llvm::Value *, llvm::Value *> PointerToBase;
+ DenseSet<llvm::Value *> NewInsertedDefs;
+ findBasePointers(result.liveset, PointerToBase, &DT, DVCache, NewInsertedDefs);
+
+ if (PrintBasePointers) {
+ errs() << "Base Pairs (w/o Relocation):\n";
+ for (auto Pair : PointerToBase) {
+ errs() << " derived %" << Pair.first->getName() << " base %"
+ << Pair.second->getName() << "\n";
+ }
+ }
+
+ result.PointerToBase = PointerToBase;
+ result.NewInsertedDefs = NewInsertedDefs;
+}
+
+/// Check for liveness of items in the insert defs and add them to the live
+/// and base pointer sets
+static void fixupLiveness(DominatorTree &DT, const CallSite &CS,
+ const DenseSet<Value *> &allInsertedDefs,
+ PartiallyConstructedSafepointRecord &result) {
+ Instruction *inst = CS.getInstruction();
+
+ auto liveset = result.liveset;
+ auto PointerToBase = result.PointerToBase;
+
+ auto is_live_gc_reference =
+ [&](Value &V) { return isLiveGCReferenceAt(V, inst, DT, nullptr); };
+
+ // For each new definition, check to see if a) the definition dominates the
+ // instruction we're interested in, and b) one of the uses of that definition
+ // is edge-reachable from the instruction we're interested in. This is the
+ // same definition of liveness we used in the intial liveness analysis
+ for (Value *newDef : allInsertedDefs) {
+ if (liveset.count(newDef)) {
+ // already live, no action needed
+ continue;
+ }
+
+ // PERF: Use DT to check instruction domination might not be good for
+ // compilation time, and we could change to optimal solution if this
+ // turn to be a issue
+ if (!DT.dominates(cast<Instruction>(newDef), inst)) {
+ // can't possibly be live at inst
+ continue;
+ }
+
+ if (is_live_gc_reference(*newDef)) {
+ // Add the live new defs into liveset and PointerToBase
+ liveset.insert(newDef);
+ PointerToBase[newDef] = newDef;
+ }
+ }
+
+ result.liveset = liveset;
+ result.PointerToBase = PointerToBase;
+}
+
+static void fixupLiveReferences(
+ Function &F, DominatorTree &DT, Pass *P,
+ const DenseSet<llvm::Value *> &allInsertedDefs,
+ ArrayRef<CallSite> toUpdate,
+ MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ const CallSite &CS = toUpdate[i];
+ fixupLiveness(DT, CS, allInsertedDefs, info);
+ }
+}
+
+// Normalize basic block to make it ready to be target of invoke statepoint.
+// It means spliting it to have single predecessor. Return newly created BB
+// ready to be successor of invoke statepoint.
+static BasicBlock *normalizeBBForInvokeSafepoint(BasicBlock *BB,
+ BasicBlock *InvokeParent,
+ Pass *P) {
+ BasicBlock *ret = BB;
+
+ if (!BB->getUniquePredecessor()) {
+ ret = SplitBlockPredecessors(BB, InvokeParent, "");
+ }
+
+ // Another requirement for such basic blocks is to not have any phi nodes.
+ // Since we just ensured that new BB will have single predecessor,
+ // all phi nodes in it will have one value. Here it would be naturall place
+ // to
+ // remove them all. But we can not do this because we are risking to remove
+ // one of the values stored in liveset of another statepoint. We will do it
+ // later after placing all safepoints.
+
+ return ret;
+}
+
+static int find_index(ArrayRef<Value *> livevec, Value *val) {
+ auto itr = std::find(livevec.begin(), livevec.end(), val);
+ assert(livevec.end() != itr);
+ size_t index = std::distance(livevec.begin(), itr);
+ assert(index < livevec.size());
+ return index;
+}
+
+// Create new attribute set containing only attributes which can be transfered
+// from original call to the safepoint.
+static AttributeSet legalizeCallAttributes(AttributeSet AS) {
+ AttributeSet ret;
+
+ for (unsigned Slot = 0; Slot < AS.getNumSlots(); Slot++) {
+ unsigned index = AS.getSlotIndex(Slot);
+
+ if (index == AttributeSet::ReturnIndex ||
+ index == AttributeSet::FunctionIndex) {
+
+ for (auto it = AS.begin(Slot), it_end = AS.end(Slot); it != it_end;
+ ++it) {
+ Attribute attr = *it;
+
+ // Do not allow certain attributes - just skip them
+ // Safepoint can not be read only or read none.
+ if (attr.hasAttribute(Attribute::ReadNone) ||
+ attr.hasAttribute(Attribute::ReadOnly))
+ continue;
+
+ ret = ret.addAttributes(
+ AS.getContext(), index,
+ AttributeSet::get(AS.getContext(), index, AttrBuilder(attr)));
+ }
+ }
+
+ // Just skip parameter attributes for now
+ }
+
+ return ret;
+}
+
+/// Helper function to place all gc relocates necessary for the given
+/// statepoint.
+/// Inputs:
+/// liveVariables - list of variables to be relocated.
+/// liveStart - index of the first live variable.
+/// basePtrs - base pointers.
+/// statepointToken - statepoint instruction to which relocates should be
+/// bound.
+/// Builder - Llvm IR builder to be used to construct new calls.
+void CreateGCRelocates(ArrayRef<llvm::Value *> liveVariables,
+ const int liveStart,
+ ArrayRef<llvm::Value *> basePtrs,
+ Instruction *statepointToken, IRBuilder<> Builder) {
+
+ SmallVector<Instruction *, 64> NewDefs;
+ NewDefs.reserve(liveVariables.size());
+
+ Module *M = statepointToken->getParent()->getParent()->getParent();
+
+ for (unsigned i = 0; i < liveVariables.size(); i++) {
+ // We generate a (potentially) unique declaration for every pointer type
+ // combination. This results is some blow up the function declarations in
+ // the IR, but removes the need for argument bitcasts which shrinks the IR
+ // greatly and makes it much more readable.
+ SmallVector<Type *, 1> types; // one per 'any' type
+ types.push_back(liveVariables[i]->getType()); // result type
+ Value *gc_relocate_decl = Intrinsic::getDeclaration(
+ M, Intrinsic::experimental_gc_relocate, types);
+
+ // Generate the gc.relocate call and save the result
+ Value *baseIdx =
+ ConstantInt::get(Type::getInt32Ty(M->getContext()),
+ liveStart + find_index(liveVariables, basePtrs[i]));
+ Value *liveIdx = ConstantInt::get(
+ Type::getInt32Ty(M->getContext()),
+ liveStart + find_index(liveVariables, liveVariables[i]));
+
+ // only specify a debug name if we can give a useful one
+ Value *reloc = Builder.CreateCall3(
+ gc_relocate_decl, statepointToken, baseIdx, liveIdx,
+ liveVariables[i]->hasName() ? liveVariables[i]->getName() + ".relocated"
+ : "");
+ // Trick CodeGen into thinking there are lots of free registers at this
+ // fake call.
+ cast<CallInst>(reloc)->setCallingConv(CallingConv::Cold);
+
+ NewDefs.push_back(cast<Instruction>(reloc));
+ }
+ assert(NewDefs.size() == liveVariables.size() &&
+ "missing or extra redefinition at safepoint");
+}
+
+static void
+makeStatepointExplicitImpl(const CallSite &CS, /* to replace */
+ const SmallVectorImpl<llvm::Value *> &basePtrs,
+ const SmallVectorImpl<llvm::Value *> &liveVariables,
+ Pass *P,
+ PartiallyConstructedSafepointRecord &result) {
+ assert(basePtrs.size() == liveVariables.size());
+ assert(isStatepoint(CS) &&
+ "This method expects to be rewriting a statepoint");
+
+ BasicBlock *BB = CS.getInstruction()->getParent();
+ assert(BB);
+ Function *F = BB->getParent();
+ assert(F && "must be set");
+ Module *M = F->getParent();
+ (void)M;
+ assert(M && "must be set");
+
+ // We're not changing the function signature of the statepoint since the gc
+ // arguments go into the var args section.
+ Function *gc_statepoint_decl = CS.getCalledFunction();
+
+ // Then go ahead and use the builder do actually do the inserts. We insert
+ // immediately before the previous instruction under the assumption that all
+ // arguments will be available here. We can't insert afterwards since we may
+ // be replacing a terminator.
+ Instruction *insertBefore = CS.getInstruction();
+ IRBuilder<> Builder(insertBefore);
+ // Copy all of the arguments from the original statepoint - this includes the
+ // target, call args, and deopt args
+ SmallVector<llvm::Value *, 64> args;
+ args.insert(args.end(), CS.arg_begin(), CS.arg_end());
+ // TODO: Clear the 'needs rewrite' flag
+
+ // add all the pointers to be relocated (gc arguments)
+ // Capture the start of the live variable list for use in the gc_relocates
+ const int live_start = args.size();
+ args.insert(args.end(), liveVariables.begin(), liveVariables.end());
+
+ // Create the statepoint given all the arguments
+ Instruction *token = nullptr;
+ AttributeSet return_attributes;
+ if (CS.isCall()) {
+ CallInst *toReplace = cast<CallInst>(CS.getInstruction());
+ CallInst *call =
+ Builder.CreateCall(gc_statepoint_decl, args, "safepoint_token");
+ call->setTailCall(toReplace->isTailCall());
+ call->setCallingConv(toReplace->getCallingConv());
+
+ // Currently we will fail on parameter attributes and on certain
+ // function attributes.
+ AttributeSet new_attrs = legalizeCallAttributes(toReplace->getAttributes());
+ // In case if we can handle this set of sttributes - set up function attrs
+ // directly on statepoint and return attrs later for gc_result intrinsic.
+ call->setAttributes(new_attrs.getFnAttributes());
+ return_attributes = new_attrs.getRetAttributes();
+
+ token = call;
+
+ // Put the following gc_result and gc_relocate calls immediately after the
+ // the old call (which we're about to delete)
+ BasicBlock::iterator next(toReplace);
+ assert(BB->end() != next && "not a terminator, must have next");
+ next++;
+ Instruction *IP = &*(next);
+ Builder.SetInsertPoint(IP);
+ Builder.SetCurrentDebugLocation(IP->getDebugLoc());
+
+ } else {
+ InvokeInst *toReplace = cast<InvokeInst>(CS.getInstruction());
+
+ // Insert the new invoke into the old block. We'll remove the old one in a
+ // moment at which point this will become the new terminator for the
+ // original block.
+ InvokeInst *invoke = InvokeInst::Create(
+ gc_statepoint_decl, toReplace->getNormalDest(),
+ toReplace->getUnwindDest(), args, "", toReplace->getParent());
+ invoke->setCallingConv(toReplace->getCallingConv());
+
+ // Currently we will fail on parameter attributes and on certain
+ // function attributes.
+ AttributeSet new_attrs = legalizeCallAttributes(toReplace->getAttributes());
+ // In case if we can handle this set of sttributes - set up function attrs
+ // directly on statepoint and return attrs later for gc_result intrinsic.
+ invoke->setAttributes(new_attrs.getFnAttributes());
+ return_attributes = new_attrs.getRetAttributes();
+
+ token = invoke;
+
+ // Generate gc relocates in exceptional path
+ BasicBlock *unwindBlock = normalizeBBForInvokeSafepoint(
+ toReplace->getUnwindDest(), invoke->getParent(), P);
+
+ Instruction *IP = &*(unwindBlock->getFirstInsertionPt());
+ Builder.SetInsertPoint(IP);
+ Builder.SetCurrentDebugLocation(toReplace->getDebugLoc());
+
+ // Extract second element from landingpad return value. We will attach
+ // exceptional gc relocates to it.
+ const unsigned idx = 1;
+ Instruction *exceptional_token =
+ cast<Instruction>(Builder.CreateExtractValue(
+ unwindBlock->getLandingPadInst(), idx, "relocate_token"));
+ result.UnwindToken = exceptional_token;
+
+ // Just throw away return value. We will use the one we got for normal
+ // block.
+ (void)CreateGCRelocates(liveVariables, live_start, basePtrs,
+ exceptional_token, Builder);
+
+ // Generate gc relocates and returns for normal block
+ BasicBlock *normalDest = normalizeBBForInvokeSafepoint(
+ toReplace->getNormalDest(), invoke->getParent(), P);
+
+ IP = &*(normalDest->getFirstInsertionPt());
+ Builder.SetInsertPoint(IP);
+
+ // gc relocates will be generated later as if it were regular call
+ // statepoint
+ }
+ assert(token);
+
+ // Take the name of the original value call if it had one.
+ token->takeName(CS.getInstruction());
+
+ // The GCResult is already inserted, we just need to find it
+#ifndef NDEBUG
+ Instruction *toReplace = CS.getInstruction();
+ assert((toReplace->hasNUses(0) || toReplace->hasNUses(1)) &&
+ "only valid use before rewrite is gc.result");
+ assert(!toReplace->hasOneUse() ||
+ isGCResult(cast<Instruction>(*toReplace->user_begin())));
+#endif
+
+ // Update the gc.result of the original statepoint (if any) to use the newly
+ // inserted statepoint. This is safe to do here since the token can't be
+ // considered a live reference.
+ CS.getInstruction()->replaceAllUsesWith(token);
+
+ result.StatepointToken = token;
+
+ // Second, create a gc.relocate for every live variable
+ CreateGCRelocates(liveVariables, live_start, basePtrs, token, Builder);
+
+}
+
+namespace {
+struct name_ordering {
+ Value *base;
+ Value *derived;
+ bool operator()(name_ordering const &a, name_ordering const &b) {
+ return -1 == a.derived->getName().compare(b.derived->getName());
+ }
+};
+}
+static void stablize_order(SmallVectorImpl<Value *> &basevec,
+ SmallVectorImpl<Value *> &livevec) {
+ assert(basevec.size() == livevec.size());
+
+ SmallVector<name_ordering, 64> temp;
+ for (size_t i = 0; i < basevec.size(); i++) {
+ name_ordering v;
+ v.base = basevec[i];
+ v.derived = livevec[i];
+ temp.push_back(v);
+ }
+ std::sort(temp.begin(), temp.end(), name_ordering());
+ for (size_t i = 0; i < basevec.size(); i++) {
+ basevec[i] = temp[i].base;
+ livevec[i] = temp[i].derived;
+ }
+}
+
+// Replace an existing gc.statepoint with a new one and a set of gc.relocates
+// which make the relocations happening at this safepoint explicit.
+//
+// WARNING: Does not do any fixup to adjust users of the original live
+// values. That's the callers responsibility.
+static void
+makeStatepointExplicit(DominatorTree &DT, const CallSite &CS, Pass *P,
+ PartiallyConstructedSafepointRecord &result) {
+ auto liveset = result.liveset;
+ auto PointerToBase = result.PointerToBase;
+
+ // Convert to vector for efficient cross referencing.
+ SmallVector<Value *, 64> basevec, livevec;
+ livevec.reserve(liveset.size());
+ basevec.reserve(liveset.size());
+ for (Value *L : liveset) {
+ livevec.push_back(L);
+
+ assert(PointerToBase.find(L) != PointerToBase.end());
+ Value *base = PointerToBase[L];
+ basevec.push_back(base);
+ }
+ assert(livevec.size() == basevec.size());
+
+ // To make the output IR slightly more stable (for use in diffs), ensure a
+ // fixed order of the values in the safepoint (by sorting the value name).
+ // The order is otherwise meaningless.
+ stablize_order(basevec, livevec);
+
+ // Do the actual rewriting and delete the old statepoint
+ makeStatepointExplicitImpl(CS, basevec, livevec, P, result);
+ CS.getInstruction()->eraseFromParent();
+}
+
+// Helper function for the relocationViaAlloca.
+// It receives iterator to the statepoint gc relocates and emits store to the
+// assigned
+// location (via allocaMap) for the each one of them.
+// Add visited values into the visitedLiveValues set we will later use them
+// for sanity check.
+static void
+insertRelocationStores(iterator_range<Value::user_iterator> gcRelocs,
+ DenseMap<Value *, Value *> &allocaMap,
+ DenseSet<Value *> &visitedLiveValues) {
+
+ for (User *U : gcRelocs) {
+ if (!isa<IntrinsicInst>(U))
+ continue;
+
+ IntrinsicInst *relocatedValue = cast<IntrinsicInst>(U);
+
+ // We only care about relocates
+ if (relocatedValue->getIntrinsicID() !=
+ Intrinsic::experimental_gc_relocate) {
+ continue;
+ }
+
+ GCRelocateOperands relocateOperands(relocatedValue);
+ Value *originalValue = const_cast<Value *>(relocateOperands.derivedPtr());
+ assert(allocaMap.count(originalValue));
+ Value *alloca = allocaMap[originalValue];
+
+ // Emit store into the related alloca
+ StoreInst *store = new StoreInst(relocatedValue, alloca);
+ store->insertAfter(relocatedValue);
+
+#ifndef NDEBUG
+ visitedLiveValues.insert(originalValue);
+#endif
+ }
+}
+
+/// do all the relocation update via allocas and mem2reg
+static void relocationViaAlloca(
+ Function &F, DominatorTree &DT, ArrayRef<Value *> live,
+ ArrayRef<struct PartiallyConstructedSafepointRecord> records) {
+#ifndef NDEBUG
+ int initialAllocaNum = 0;
+
+ // record initial number of allocas
+ for (inst_iterator itr = inst_begin(F), end = inst_end(F); itr != end;
+ itr++) {
+ if (isa<AllocaInst>(*itr))
+ initialAllocaNum++;
+ }
+#endif
+
+ // TODO-PERF: change data structures, reserve
+ DenseMap<Value *, Value *> allocaMap;
+ SmallVector<AllocaInst *, 200> PromotableAllocas;
+ PromotableAllocas.reserve(live.size());
+
+ // emit alloca for each live gc pointer
+ for (unsigned i = 0; i < live.size(); i++) {
+ Value *liveValue = live[i];
+ AllocaInst *alloca = new AllocaInst(liveValue->getType(), "",
+ F.getEntryBlock().getFirstNonPHI());
+ allocaMap[liveValue] = alloca;
+ PromotableAllocas.push_back(alloca);
+ }
+
+ // The next two loops are part of the same conceptual operation. We need to
+ // insert a store to the alloca after the original def and at each
+ // redefinition. We need to insert a load before each use. These are split
+ // into distinct loops for performance reasons.
+
+ // update gc pointer after each statepoint
+ // either store a relocated value or null (if no relocated value found for
+ // this gc pointer and it is not a gc_result)
+ // this must happen before we update the statepoint with load of alloca
+ // otherwise we lose the link between statepoint and old def
+ for (size_t i = 0; i < records.size(); i++) {
+ const struct PartiallyConstructedSafepointRecord &info = records[i];
+ Value *Statepoint = info.StatepointToken;
+
+ // This will be used for consistency check
+ DenseSet<Value *> visitedLiveValues;
+
+ // Insert stores for normal statepoint gc relocates
+ insertRelocationStores(Statepoint->users(), allocaMap, visitedLiveValues);
+
+ // In case if it was invoke statepoint
+ // we will insert stores for exceptional path gc relocates.
+ if (isa<InvokeInst>(Statepoint)) {
+ insertRelocationStores(info.UnwindToken->users(),
+ allocaMap, visitedLiveValues);
+ }
+
+#ifndef NDEBUG
+ // As a debuging aid, pretend that an unrelocated pointer becomes null at
+ // the gc.statepoint. This will turn some subtle GC problems into slightly
+ // easier to debug SEGVs
+ SmallVector<AllocaInst *, 64> ToClobber;
+ for (auto Pair : allocaMap) {
+ Value *Def = Pair.first;
+ AllocaInst *Alloca = cast<AllocaInst>(Pair.second);
+
+ // This value was relocated
+ if (visitedLiveValues.count(Def)) {
+ continue;
+ }
+ ToClobber.push_back(Alloca);
+ }
+
+ auto InsertClobbersAt = [&](Instruction *IP) {
+ for (auto *AI : ToClobber) {
+ auto AIType = cast<PointerType>(AI->getType());
+ auto PT = cast<PointerType>(AIType->getElementType());
+ Constant *CPN = ConstantPointerNull::get(PT);
+ StoreInst *store = new StoreInst(CPN, AI);
+ store->insertBefore(IP);
+ }
+ };
+
+ // Insert the clobbering stores. These may get intermixed with the
+ // gc.results and gc.relocates, but that's fine.
+ if (auto II = dyn_cast<InvokeInst>(Statepoint)) {
+ InsertClobbersAt(II->getNormalDest()->getFirstInsertionPt());
+ InsertClobbersAt(II->getUnwindDest()->getFirstInsertionPt());
+ } else {
+ BasicBlock::iterator Next(cast<CallInst>(Statepoint));
+ Next++;
+ InsertClobbersAt(Next);
+ }
+#endif
+ }
+ // update use with load allocas and add store for gc_relocated
+ for (auto Pair : allocaMap) {
+ Value *def = Pair.first;
+ Value *alloca = Pair.second;
+
+ // we pre-record the uses of allocas so that we dont have to worry about
+ // later update
+ // that change the user information.
+ SmallVector<Instruction *, 20> uses;
+ // PERF: trade a linear scan for repeated reallocation
+ uses.reserve(std::distance(def->user_begin(), def->user_end()));
+ for (User *U : def->users()) {
+ if (!isa<ConstantExpr>(U)) {
+ // If the def has a ConstantExpr use, then the def is either a
+ // ConstantExpr use itself or null. In either case
+ // (recursively in the first, directly in the second), the oop
+ // it is ultimately dependent on is null and this particular
+ // use does not need to be fixed up.
+ uses.push_back(cast<Instruction>(U));
+ }
+ }
+
+ std::sort(uses.begin(), uses.end());
+ auto last = std::unique(uses.begin(), uses.end());
+ uses.erase(last, uses.end());
+
+ for (Instruction *use : uses) {
+ if (isa<PHINode>(use)) {
+ PHINode *phi = cast<PHINode>(use);
+ for (unsigned i = 0; i < phi->getNumIncomingValues(); i++) {
+ if (def == phi->getIncomingValue(i)) {
+ LoadInst *load = new LoadInst(
+ alloca, "", phi->getIncomingBlock(i)->getTerminator());
+ phi->setIncomingValue(i, load);
+ }
+ }
+ } else {
+ LoadInst *load = new LoadInst(alloca, "", use);
+ use->replaceUsesOfWith(def, load);
+ }
+ }
+
+ // emit store for the initial gc value
+ // store must be inserted after load, otherwise store will be in alloca's
+ // use list and an extra load will be inserted before it
+ StoreInst *store = new StoreInst(def, alloca);
+ if (isa<Instruction>(def)) {
+ store->insertAfter(cast<Instruction>(def));
+ } else {
+ assert((isa<Argument>(def) || isa<GlobalVariable>(def) ||
+ (isa<Constant>(def) && cast<Constant>(def)->isNullValue())) &&
+ "Must be argument or global");
+ store->insertAfter(cast<Instruction>(alloca));
+ }
+ }
+
+ assert(PromotableAllocas.size() == live.size() &&
+ "we must have the same allocas with lives");
+ if (!PromotableAllocas.empty()) {
+ // apply mem2reg to promote alloca to SSA
+ PromoteMemToReg(PromotableAllocas, DT);
+ }
+
+#ifndef NDEBUG
+ for (inst_iterator itr = inst_begin(F), end = inst_end(F); itr != end;
+ itr++) {
+ if (isa<AllocaInst>(*itr))
+ initialAllocaNum--;
+ }
+ assert(initialAllocaNum == 0 && "We must not introduce any extra allocas");
+#endif
+}
+
+/// Implement a unique function which doesn't require we sort the input
+/// vector. Doing so has the effect of changing the output of a couple of
+/// tests in ways which make them less useful in testing fused safepoints.
+template <typename T> static void unique_unsorted(SmallVectorImpl<T> &Vec) {
+ DenseSet<T> Seen;
+ SmallVector<T, 128> TempVec;
+ TempVec.reserve(Vec.size());
+ for (auto Element : Vec)
+ TempVec.push_back(Element);
+ Vec.clear();
+ for (auto V : TempVec) {
+ if (Seen.insert(V).second) {
+ Vec.push_back(V);
+ }
+ }
+}
+
+static Function *getUseHolder(Module &M) {
+ FunctionType *ftype =
+ FunctionType::get(Type::getVoidTy(M.getContext()), true);
+ Function *Func = cast<Function>(M.getOrInsertFunction("__tmp_use", ftype));
+ return Func;
+}
+
+/// Insert holders so that each Value is obviously live through the entire
+/// liftetime of the call.
+static void insertUseHolderAfter(CallSite &CS, const ArrayRef<Value *> Values,
+ SmallVectorImpl<CallInst *> &holders) {
+ Module *M = CS.getInstruction()->getParent()->getParent()->getParent();
+ Function *Func = getUseHolder(*M);
+ if (CS.isCall()) {
+ // For call safepoints insert dummy calls right after safepoint
+ BasicBlock::iterator next(CS.getInstruction());
+ next++;
+ CallInst *base_holder = CallInst::Create(Func, Values, "", next);
+ holders.push_back(base_holder);
+ } else if (CS.isInvoke()) {
+ // For invoke safepooints insert dummy calls both in normal and
+ // exceptional destination blocks
+ InvokeInst *invoke = cast<InvokeInst>(CS.getInstruction());
+ CallInst *normal_holder = CallInst::Create(
+ Func, Values, "", invoke->getNormalDest()->getFirstInsertionPt());
+ CallInst *unwind_holder = CallInst::Create(
+ Func, Values, "", invoke->getUnwindDest()->getFirstInsertionPt());
+ holders.push_back(normal_holder);
+ holders.push_back(unwind_holder);
+ } else
+ llvm_unreachable("unsupported call type");
+}
+
+static void findLiveReferences(
+ Function &F, DominatorTree &DT, Pass *P, ArrayRef<CallSite> toUpdate,
+ MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ const CallSite &CS = toUpdate[i];
+ analyzeParsePointLiveness(DT, CS, info);
+ }
+}
+
+static void addBasesAsLiveValues(StatepointLiveSetTy &liveset,
+ DenseMap<Value *, Value *> &PointerToBase) {
+ // Identify any base pointers which are used in this safepoint, but not
+ // themselves relocated. We need to relocate them so that later inserted
+ // safepoints can get the properly relocated base register.
+ DenseSet<Value *> missing;
+ for (Value *L : liveset) {
+ assert(PointerToBase.find(L) != PointerToBase.end());
+ Value *base = PointerToBase[L];
+ assert(base);
+ if (liveset.find(base) == liveset.end()) {
+ assert(PointerToBase.find(base) == PointerToBase.end());
+ // uniqued by set insert
+ missing.insert(base);
+ }
+ }
+
+ // Note that we want these at the end of the list, otherwise
+ // register placement gets screwed up once we lower to STATEPOINT
+ // instructions. This is an utter hack, but there doesn't seem to be a
+ // better one.
+ for (Value *base : missing) {
+ assert(base);
+ liveset.insert(base);
+ PointerToBase[base] = base;
+ }
+ assert(liveset.size() == PointerToBase.size());
+}
+
+static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P,
+ SmallVectorImpl<CallSite> &toUpdate) {
+#ifndef NDEBUG
+ // sanity check the input
+ std::set<CallSite> uniqued;
+ uniqued.insert(toUpdate.begin(), toUpdate.end());
+ assert(uniqued.size() == toUpdate.size() && "no duplicates please!");
+
+ for (size_t i = 0; i < toUpdate.size(); i++) {
+ CallSite &CS = toUpdate[i];
+ assert(CS.getInstruction()->getParent()->getParent() == &F);
+ assert(isStatepoint(CS) && "expected to already be a deopt statepoint");
+ }
+#endif
+
+ // A list of dummy calls added to the IR to keep various values obviously
+ // live in the IR. We'll remove all of these when done.
+ SmallVector<CallInst *, 64> holders;
+
+ // Insert a dummy call with all of the arguments to the vm_state we'll need
+ // for the actual safepoint insertion. This ensures reference arguments in
+ // the deopt argument list are considered live through the safepoint (and
+ // thus makes sure they get relocated.)
+ for (size_t i = 0; i < toUpdate.size(); i++) {
+ CallSite &CS = toUpdate[i];
+ Statepoint StatepointCS(CS);
+
+ SmallVector<Value *, 64> DeoptValues;
+ for (Use &U : StatepointCS.vm_state_args()) {
+ Value *Arg = cast<Value>(&U);
+ if (isGCPointerType(Arg->getType()))
+ DeoptValues.push_back(Arg);
+ }
+ insertUseHolderAfter(CS, DeoptValues, holders);
+ }
+
+ SmallVector<struct PartiallyConstructedSafepointRecord, 64> records;
+ records.reserve(toUpdate.size());
+ for (size_t i = 0; i < toUpdate.size(); i++) {
+ struct PartiallyConstructedSafepointRecord info;
+ records.push_back(info);
+ }
+ assert(records.size() == toUpdate.size());
+
+ // A) Identify all gc pointers which are staticly live at the given call
+ // site.
+ findLiveReferences(F, DT, P, toUpdate, records);
+
+ // B) Find the base pointers for each live pointer
+ /* scope for caching */ {
+ // Cache the 'defining value' relation used in the computation and
+ // insertion of base phis and selects. This ensures that we don't insert
+ // large numbers of duplicate base_phis.
+ DefiningValueMapTy DVCache;
+
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ CallSite &CS = toUpdate[i];
+ findBasePointers(DT, DVCache, CS, info);
+ }
+ } // end of cache scope
+
+ // The base phi insertion logic (for any safepoint) may have inserted new
+ // instructions which are now live at some safepoint. The simplest such
+ // example is:
+ // loop:
+ // phi a <-- will be a new base_phi here
+ // safepoint 1 <-- that needs to be live here
+ // gep a + 1
+ // safepoint 2
+ // br loop
+ DenseSet<llvm::Value *> allInsertedDefs;
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ allInsertedDefs.insert(info.NewInsertedDefs.begin(),
+ info.NewInsertedDefs.end());
+ }
+
+ // We insert some dummy calls after each safepoint to definitely hold live
+ // the base pointers which were identified for that safepoint. We'll then
+ // ask liveness for _every_ base inserted to see what is now live. Then we
+ // remove the dummy calls.
+ holders.reserve(holders.size() + records.size());
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ CallSite &CS = toUpdate[i];
+
+ SmallVector<Value *, 128> Bases;
+ for (auto Pair : info.PointerToBase) {
+ Bases.push_back(Pair.second);
+ }
+ insertUseHolderAfter(CS, Bases, holders);
+ }
+
+ // Add the bases explicitly to the live vector set. This may result in a few
+ // extra relocations, but the base has to be available whenever a pointer
+ // derived from it is used. Thus, we need it to be part of the statepoint's
+ // gc arguments list. TODO: Introduce an explicit notion (in the following
+ // code) of the GC argument list as seperate from the live Values at a
+ // given statepoint.
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ addBasesAsLiveValues(info.liveset, info.PointerToBase);
+ }
+
+ // If we inserted any new values, we need to adjust our notion of what is
+ // live at a particular safepoint.
+ if (!allInsertedDefs.empty()) {
+ fixupLiveReferences(F, DT, P, allInsertedDefs, toUpdate, records);
+ }
+ if (PrintBasePointers) {
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ errs() << "Base Pairs: (w/Relocation)\n";
+ for (auto Pair : info.PointerToBase) {
+ errs() << " derived %" << Pair.first->getName() << " base %"
+ << Pair.second->getName() << "\n";
+ }
+ }
+ }
+ for (size_t i = 0; i < holders.size(); i++) {
+ holders[i]->eraseFromParent();
+ holders[i] = nullptr;
+ }
+ holders.clear();
+
+ // Now run through and replace the existing statepoints with new ones with
+ // the live variables listed. We do not yet update uses of the values being
+ // relocated. We have references to live variables that need to
+ // survive to the last iteration of this loop. (By construction, the
+ // previous statepoint can not be a live variable, thus we can and remove
+ // the old statepoint calls as we go.)
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ CallSite &CS = toUpdate[i];
+ makeStatepointExplicit(DT, CS, P, info);
+ }
+ toUpdate.clear(); // prevent accident use of invalid CallSites
+
+ // In case if we inserted relocates in a different basic block than the
+ // original safepoint (this can happen for invokes). We need to be sure that
+ // original values were not used in any of the phi nodes at the
+ // beginning of basic block containing them. Because we know that all such
+ // blocks will have single predecessor we can safely assume that all phi
+ // nodes have single entry (because of normalizeBBForInvokeSafepoint).
+ // Just remove them all here.
+ for (size_t i = 0; i < records.size(); i++) {
+ Instruction *I = records[i].StatepointToken;
+
+ if (InvokeInst *invoke = dyn_cast<InvokeInst>(I)) {
+ FoldSingleEntryPHINodes(invoke->getNormalDest());
+ assert(!isa<PHINode>(invoke->getNormalDest()->begin()));
+
+ FoldSingleEntryPHINodes(invoke->getUnwindDest());
+ assert(!isa<PHINode>(invoke->getUnwindDest()->begin()));
+ }
+ }
+
+ // Do all the fixups of the original live variables to their relocated selves
+ SmallVector<Value *, 128> live;
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ // We can't simply save the live set from the original insertion. One of
+ // the live values might be the result of a call which needs a safepoint.
+ // That Value* no longer exists and we need to use the new gc_result.
+ // Thankfully, the liveset is embedded in the statepoint (and updated), so
+ // we just grab that.
+ Statepoint statepoint(info.StatepointToken);
+ live.insert(live.end(), statepoint.gc_args_begin(),
+ statepoint.gc_args_end());
+ }
+ unique_unsorted(live);
+
+#ifndef NDEBUG
+ // sanity check
+ for (auto ptr : live) {
+ assert(isGCPointerType(ptr->getType()) && "must be a gc pointer type");
+ }
+#endif
+
+ relocationViaAlloca(F, DT, live, records);
+ return !records.empty();
+}
+
+/// Returns true if this function should be rewritten by this pass. The main
+/// point of this function is as an extension point for custom logic.
+static bool shouldRewriteStatepointsIn(Function &F) {
+ // TODO: This should check the GCStrategy
+ if (F.hasGC()) {
+ const std::string StatepointExampleName("statepoint-example");
+ return StatepointExampleName == F.getGC();
+ } else
+ return false;
+}
+
+bool RewriteStatepointsForGC::runOnFunction(Function &F) {
+ // Nothing to do for declarations.
+ if (F.isDeclaration() || F.empty())
+ return false;
+
+ // Policy choice says not to rewrite - the most common reason is that we're
+ // compiling code without a GCStrategy.
+ if (!shouldRewriteStatepointsIn(F))
+ return false;
+
+ // Gather all the statepoints which need rewritten.
+ SmallVector<CallSite, 64> ParsePointNeeded;
+ for (Instruction &I : inst_range(F)) {
+ // TODO: only the ones with the flag set!
+ if (isStatepoint(I))
+ ParsePointNeeded.push_back(CallSite(&I));
+ }
+
+ // Return early if no work to do.
+ if (ParsePointNeeded.empty())
+ return false;
+
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ return insertParsePoints(F, DT, this, ParsePointNeeded);
+}
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index cfc9a8e..05b9608 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -35,7 +35,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
@@ -1504,7 +1504,7 @@ namespace {
///
struct SCCP : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfo>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
}
static char ID; // Pass identification, replacement for typeid
SCCP() : FunctionPass(ID) {
@@ -1563,7 +1563,8 @@ bool SCCP::runOnFunction(Function &F) {
DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
const DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
- const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
SCCPSolver Solver(DL, TLI);
// Mark the first block of the function as being executable.
@@ -1637,7 +1638,7 @@ namespace {
///
struct IPSCCP : public ModulePass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfo>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
}
static char ID;
IPSCCP() : ModulePass(ID) {
@@ -1651,7 +1652,7 @@ char IPSCCP::ID = 0;
INITIALIZE_PASS_BEGIN(IPSCCP, "ipsccp",
"Interprocedural Sparse Conditional Constant Propagation",
false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(IPSCCP, "ipsccp",
"Interprocedural Sparse Conditional Constant Propagation",
false, false)
@@ -1692,7 +1693,8 @@ static bool AddressIsTaken(const GlobalValue *GV) {
bool IPSCCP::runOnModule(Module &M) {
DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
- const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
SCCPSolver Solver(DL, TLI);
// AddressTakenFunctions - This set keeps track of the address-taken functions
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index 6135114..f69c750 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -28,7 +28,7 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/PtrUseVisitor.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -79,8 +79,8 @@ STATISTIC(NumVectorized, "Number of vectorized aggregates");
/// Hidden option to force the pass to not use DomTree and mem2reg, instead
/// forming SSA values through the SSAUpdater infrastructure.
-static cl::opt<bool>
-ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden);
+static cl::opt<bool> ForceSSAUpdater("force-ssa-updater", cl::init(false),
+ cl::Hidden);
/// Hidden option to enable randomly shuffling the slices to help uncover
/// instability in their order.
@@ -89,15 +89,15 @@ static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices",
/// Hidden option to experiment with completely strict handling of inbounds
/// GEPs.
-static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds",
- cl::init(false), cl::Hidden);
+static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
+ cl::Hidden);
namespace {
/// \brief A custom IRBuilder inserter which prefixes all names if they are
/// preserved.
template <bool preserveNames = true>
-class IRBuilderPrefixedInserter :
- public IRBuilderDefaultInserter<preserveNames> {
+class IRBuilderPrefixedInserter
+ : public IRBuilderDefaultInserter<preserveNames> {
std::string Prefix;
public:
@@ -113,19 +113,19 @@ protected:
// Specialization for not preserving the name is trivial.
template <>
-class IRBuilderPrefixedInserter<false> :
- public IRBuilderDefaultInserter<false> {
+class IRBuilderPrefixedInserter<false>
+ : public IRBuilderDefaultInserter<false> {
public:
void SetNamePrefix(const Twine &P) {}
};
/// \brief Provide a typedef for IRBuilder that drops names in release builds.
#ifndef NDEBUG
-typedef llvm::IRBuilder<true, ConstantFolder,
- IRBuilderPrefixedInserter<true> > IRBuilderTy;
+typedef llvm::IRBuilder<true, ConstantFolder, IRBuilderPrefixedInserter<true>>
+ IRBuilderTy;
#else
-typedef llvm::IRBuilder<false, ConstantFolder,
- IRBuilderPrefixedInserter<false> > IRBuilderTy;
+typedef llvm::IRBuilder<false, ConstantFolder, IRBuilderPrefixedInserter<false>>
+ IRBuilderTy;
#endif
}
@@ -171,10 +171,14 @@ public:
/// decreasing. Thus the spanning range comes first in a cluster with the
/// same start position.
bool operator<(const Slice &RHS) const {
- if (beginOffset() < RHS.beginOffset()) return true;
- if (beginOffset() > RHS.beginOffset()) return false;
- if (isSplittable() != RHS.isSplittable()) return !isSplittable();
- if (endOffset() > RHS.endOffset()) return true;
+ if (beginOffset() < RHS.beginOffset())
+ return true;
+ if (beginOffset() > RHS.beginOffset())
+ return false;
+ if (isSplittable() != RHS.isSplittable())
+ return !isSplittable();
+ if (endOffset() > RHS.endOffset())
+ return true;
return false;
}
@@ -198,9 +202,7 @@ public:
namespace llvm {
template <typename T> struct isPodLike;
-template <> struct isPodLike<Slice> {
- static const bool value = true;
-};
+template <> struct isPodLike<Slice> { static const bool value = true; };
}
namespace {
@@ -235,6 +237,298 @@ public:
const_iterator end() const { return Slices.end(); }
/// @}
+ /// \brief Erase a range of slices.
+ void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
+
+ /// \brief Insert new slices for this alloca.
+ ///
+ /// This moves the slices into the alloca's slices collection, and re-sorts
+ /// everything so that the usual ordering properties of the alloca's slices
+ /// hold.
+ void insert(ArrayRef<Slice> NewSlices) {
+ int OldSize = Slices.size();
+ std::move(NewSlices.begin(), NewSlices.end(), std::back_inserter(Slices));
+ auto SliceI = Slices.begin() + OldSize;
+ std::sort(SliceI, Slices.end());
+ std::inplace_merge(Slices.begin(), SliceI, Slices.end());
+ }
+
+ // Forward declare an iterator to befriend it.
+ class partition_iterator;
+
+ /// \brief A partition of the slices.
+ ///
+ /// An ephemeral representation for a range of slices which can be viewed as
+ /// a partition of the alloca. This range represents a span of the alloca's
+ /// memory which cannot be split, and provides access to all of the slices
+ /// overlapping some part of the partition.
+ ///
+ /// Objects of this type are produced by traversing the alloca's slices, but
+ /// are only ephemeral and not persistent.
+ class Partition {
+ private:
+ friend class AllocaSlices;
+ friend class AllocaSlices::partition_iterator;
+
+ /// \brief The begining and ending offsets of the alloca for this partition.
+ uint64_t BeginOffset, EndOffset;
+
+ /// \brief The start end end iterators of this partition.
+ iterator SI, SJ;
+
+ /// \brief A collection of split slice tails overlapping the partition.
+ SmallVector<Slice *, 4> SplitTails;
+
+ /// \brief Raw constructor builds an empty partition starting and ending at
+ /// the given iterator.
+ Partition(iterator SI) : SI(SI), SJ(SI) {}
+
+ public:
+ /// \brief The start offset of this partition.
+ ///
+ /// All of the contained slices start at or after this offset.
+ uint64_t beginOffset() const { return BeginOffset; }
+
+ /// \brief The end offset of this partition.
+ ///
+ /// All of the contained slices end at or before this offset.
+ uint64_t endOffset() const { return EndOffset; }
+
+ /// \brief The size of the partition.
+ ///
+ /// Note that this can never be zero.
+ uint64_t size() const {
+ assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
+ return EndOffset - BeginOffset;
+ }
+
+ /// \brief Test whether this partition contains no slices, and merely spans
+ /// a region occupied by split slices.
+ bool empty() const { return SI == SJ; }
+
+ /// \name Iterate slices that start within the partition.
+ /// These may be splittable or unsplittable. They have a begin offset >= the
+ /// partition begin offset.
+ /// @{
+ // FIXME: We should probably define a "concat_iterator" helper and use that
+ // to stitch together pointee_iterators over the split tails and the
+ // contiguous iterators of the partition. That would give a much nicer
+ // interface here. We could then additionally expose filtered iterators for
+ // split, unsplit, and unsplittable splices based on the usage patterns.
+ iterator begin() const { return SI; }
+ iterator end() const { return SJ; }
+ /// @}
+
+ /// \brief Get the sequence of split slice tails.
+ ///
+ /// These tails are of slices which start before this partition but are
+ /// split and overlap into the partition. We accumulate these while forming
+ /// partitions.
+ ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
+ };
+
+ /// \brief An iterator over partitions of the alloca's slices.
+ ///
+ /// This iterator implements the core algorithm for partitioning the alloca's
+ /// slices. It is a forward iterator as we don't support backtracking for
+ /// efficiency reasons, and re-use a single storage area to maintain the
+ /// current set of split slices.
+ ///
+ /// It is templated on the slice iterator type to use so that it can operate
+ /// with either const or non-const slice iterators.
+ class partition_iterator
+ : public iterator_facade_base<partition_iterator,
+ std::forward_iterator_tag, Partition> {
+ friend class AllocaSlices;
+
+ /// \brief Most of the state for walking the partitions is held in a class
+ /// with a nice interface for examining them.
+ Partition P;
+
+ /// \brief We need to keep the end of the slices to know when to stop.
+ AllocaSlices::iterator SE;
+
+ /// \brief We also need to keep track of the maximum split end offset seen.
+ /// FIXME: Do we really?
+ uint64_t MaxSplitSliceEndOffset;
+
+ /// \brief Sets the partition to be empty at given iterator, and sets the
+ /// end iterator.
+ partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
+ : P(SI), SE(SE), MaxSplitSliceEndOffset(0) {
+ // If not already at the end, advance our state to form the initial
+ // partition.
+ if (SI != SE)
+ advance();
+ }
+
+ /// \brief Advance the iterator to the next partition.
+ ///
+ /// Requires that the iterator not be at the end of the slices.
+ void advance() {
+ assert((P.SI != SE || !P.SplitTails.empty()) &&
+ "Cannot advance past the end of the slices!");
+
+ // Clear out any split uses which have ended.
+ if (!P.SplitTails.empty()) {
+ if (P.EndOffset >= MaxSplitSliceEndOffset) {
+ // If we've finished all splits, this is easy.
+ P.SplitTails.clear();
+ MaxSplitSliceEndOffset = 0;
+ } else {
+ // Remove the uses which have ended in the prior partition. This
+ // cannot change the max split slice end because we just checked that
+ // the prior partition ended prior to that max.
+ P.SplitTails.erase(
+ std::remove_if(
+ P.SplitTails.begin(), P.SplitTails.end(),
+ [&](Slice *S) { return S->endOffset() <= P.EndOffset; }),
+ P.SplitTails.end());
+ assert(std::any_of(P.SplitTails.begin(), P.SplitTails.end(),
+ [&](Slice *S) {
+ return S->endOffset() == MaxSplitSliceEndOffset;
+ }) &&
+ "Could not find the current max split slice offset!");
+ assert(std::all_of(P.SplitTails.begin(), P.SplitTails.end(),
+ [&](Slice *S) {
+ return S->endOffset() <= MaxSplitSliceEndOffset;
+ }) &&
+ "Max split slice end offset is not actually the max!");
+ }
+ }
+
+ // If P.SI is already at the end, then we've cleared the split tail and
+ // now have an end iterator.
+ if (P.SI == SE) {
+ assert(P.SplitTails.empty() && "Failed to clear the split slices!");
+ return;
+ }
+
+ // If we had a non-empty partition previously, set up the state for
+ // subsequent partitions.
+ if (P.SI != P.SJ) {
+ // Accumulate all the splittable slices which started in the old
+ // partition into the split list.
+ for (Slice &S : P)
+ if (S.isSplittable() && S.endOffset() > P.EndOffset) {
+ P.SplitTails.push_back(&S);
+ MaxSplitSliceEndOffset =
+ std::max(S.endOffset(), MaxSplitSliceEndOffset);
+ }
+
+ // Start from the end of the previous partition.
+ P.SI = P.SJ;
+
+ // If P.SI is now at the end, we at most have a tail of split slices.
+ if (P.SI == SE) {
+ P.BeginOffset = P.EndOffset;
+ P.EndOffset = MaxSplitSliceEndOffset;
+ return;
+ }
+
+ // If the we have split slices and the next slice is after a gap and is
+ // not splittable immediately form an empty partition for the split
+ // slices up until the next slice begins.
+ if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
+ !P.SI->isSplittable()) {
+ P.BeginOffset = P.EndOffset;
+ P.EndOffset = P.SI->beginOffset();
+ return;
+ }
+ }
+
+ // OK, we need to consume new slices. Set the end offset based on the
+ // current slice, and step SJ past it. The beginning offset of the
+ // parttion is the beginning offset of the next slice unless we have
+ // pre-existing split slices that are continuing, in which case we begin
+ // at the prior end offset.
+ P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
+ P.EndOffset = P.SI->endOffset();
+ ++P.SJ;
+
+ // There are two strategies to form a partition based on whether the
+ // partition starts with an unsplittable slice or a splittable slice.
+ if (!P.SI->isSplittable()) {
+ // When we're forming an unsplittable region, it must always start at
+ // the first slice and will extend through its end.
+ assert(P.BeginOffset == P.SI->beginOffset());
+
+ // Form a partition including all of the overlapping slices with this
+ // unsplittable slice.
+ while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
+ if (!P.SJ->isSplittable())
+ P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
+ ++P.SJ;
+ }
+
+ // We have a partition across a set of overlapping unsplittable
+ // partitions.
+ return;
+ }
+
+ // If we're starting with a splittable slice, then we need to form
+ // a synthetic partition spanning it and any other overlapping splittable
+ // splices.
+ assert(P.SI->isSplittable() && "Forming a splittable partition!");
+
+ // Collect all of the overlapping splittable slices.
+ while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
+ P.SJ->isSplittable()) {
+ P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
+ ++P.SJ;
+ }
+
+ // Back upiP.EndOffset if we ended the span early when encountering an
+ // unsplittable slice. This synthesizes the early end offset of
+ // a partition spanning only splittable slices.
+ if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
+ assert(!P.SJ->isSplittable());
+ P.EndOffset = P.SJ->beginOffset();
+ }
+ }
+
+ public:
+ bool operator==(const partition_iterator &RHS) const {
+ assert(SE == RHS.SE &&
+ "End iterators don't match between compared partition iterators!");
+
+ // The observed positions of partitions is marked by the P.SI iterator and
+ // the emptyness of the split slices. The latter is only relevant when
+ // P.SI == SE, as the end iterator will additionally have an empty split
+ // slices list, but the prior may have the same P.SI and a tail of split
+ // slices.
+ if (P.SI == RHS.P.SI &&
+ P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
+ assert(P.SJ == RHS.P.SJ &&
+ "Same set of slices formed two different sized partitions!");
+ assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
+ "Same slice position with differently sized non-empty split "
+ "slice tails!");
+ return true;
+ }
+ return false;
+ }
+
+ partition_iterator &operator++() {
+ advance();
+ return *this;
+ }
+
+ Partition &operator*() { return P; }
+ };
+
+ /// \brief A forward range over the partitions of the alloca's slices.
+ ///
+ /// This accesses an iterator range over the partitions of the alloca's
+ /// slices. It computes these partitions on the fly based on the overlapping
+ /// offsets of the slices and the ability to split them. It will visit "empty"
+ /// partitions to cover regions of the alloca only accessed via split
+ /// slices.
+ iterator_range<partition_iterator> partitions() {
+ return make_range(partition_iterator(begin(), end()),
+ partition_iterator(end(), end()));
+ }
+
/// \brief Access the dead users for this alloca.
ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
@@ -308,7 +602,7 @@ static Value *foldSelectInst(SelectInst &SI) {
// being selected between, fold the select. Yes this does (rarely) happen
// early on.
if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
- return SI.getOperand(1+CI->isZero());
+ return SI.getOperand(1 + CI->isZero());
if (SI.getOperand(1) == SI.getOperand(2))
return SI.getOperand(1);
@@ -421,7 +715,8 @@ private:
GEPOffset +=
APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx));
} else {
- // For array or vector indices, scale the index by the size of the type.
+ // For array or vector indices, scale the index by the size of the
+ // type.
APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
GEPOffset += Index * APInt(Offset.getBitWidth(),
DL.getTypeAllocSize(GTI.getIndexedType()));
@@ -440,16 +735,10 @@ private:
void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
uint64_t Size, bool IsVolatile) {
- // We allow splitting of loads and stores where the type is an integer type
- // and cover the entire alloca. This prevents us from splitting over
- // eagerly.
- // FIXME: In the great blue eventually, we should eagerly split all integer
- // loads and stores, and then have a separate step that merges adjacent
- // alloca partitions into a single partition suitable for integer widening.
- // Or we should skip the merge step and rely on GVN and other passes to
- // merge adjacent loads and stores that survive mem2reg.
- bool IsSplittable =
- Ty->isIntegerTy() && !IsVolatile && Offset == 0 && Size >= AllocSize;
+ // We allow splitting of non-volatile loads and stores where the type is an
+ // integer type. These may be used to implement 'memcpy' or other "transfer
+ // of bits" patterns.
+ bool IsSplittable = Ty->isIntegerTy() && !IsVolatile;
insertUse(I, Offset, Size, IsSplittable);
}
@@ -495,7 +784,6 @@ private:
handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
}
-
void visitMemSetInst(MemSetInst &II) {
assert(II.getRawDest() == *U && "Pointer use is not the destination?");
ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
@@ -507,9 +795,8 @@ private:
if (!IsOffsetKnown)
return PI.setAborted(&II);
- insertUse(II, Offset,
- Length ? Length->getLimitedValue()
- : AllocSize - Offset.getLimitedValue(),
+ insertUse(II, Offset, Length ? Length->getLimitedValue()
+ : AllocSize - Offset.getLimitedValue(),
(bool)Length);
}
@@ -533,15 +820,15 @@ private:
// FIXME: Yet another place we really should bypass this when
// instrumenting for ASan.
if (Offset.uge(AllocSize)) {
- SmallDenseMap<Instruction *, unsigned>::iterator MTPI = MemTransferSliceMap.find(&II);
+ SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
+ MemTransferSliceMap.find(&II);
if (MTPI != MemTransferSliceMap.end())
AS.Slices[MTPI->second].kill();
return markAsDead(II);
}
uint64_t RawOffset = Offset.getLimitedValue();
- uint64_t Size = Length ? Length->getLimitedValue()
- : AllocSize - RawOffset;
+ uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
// Check for the special case where the same exact value is used for both
// source and dest.
@@ -697,18 +984,12 @@ private:
insertUse(I, Offset, Size);
}
- void visitPHINode(PHINode &PN) {
- visitPHINodeOrSelectInst(PN);
- }
+ void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
- void visitSelectInst(SelectInst &SI) {
- visitPHINodeOrSelectInst(SI);
- }
+ void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
/// \brief Disable SROA entirely if there are unhandled users of the alloca.
- void visitInstruction(Instruction &I) {
- PI.setAborted(&I);
- }
+ void visitInstruction(Instruction &I) { PI.setAborted(&I); }
};
AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
@@ -729,7 +1010,9 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
}
Slices.erase(std::remove_if(Slices.begin(), Slices.end(),
- std::mem_fun_ref(&Slice::isDead)),
+ [](const Slice &S) {
+ return S.isDead();
+ }),
Slices.end());
#if __cplusplus >= 201103L && !defined(NDEBUG)
@@ -749,6 +1032,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
void AllocaSlices::print(raw_ostream &OS, const_iterator I,
StringRef Indent) const {
printSlice(OS, I, Indent);
+ OS << "\n";
printUse(OS, I, Indent);
}
@@ -756,7 +1040,7 @@ void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
StringRef Indent) const {
OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
<< " slice #" << (I - begin())
- << (I->isSplittable() ? " (splittable)" : "") << "\n";
+ << (I->isSplittable() ? " (splittable)" : "");
}
void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
@@ -804,15 +1088,17 @@ public:
AllocaInst &AI, DIBuilder &DIB)
: LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {}
- void run(const SmallVectorImpl<Instruction*> &Insts) {
+ void run(const SmallVectorImpl<Instruction *> &Insts) {
// Retain the debug information attached to the alloca for use when
// rewriting loads and stores.
- if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) {
- for (User *U : DebugNode->users())
- if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
- DDIs.push_back(DDI);
- else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
- DVIs.push_back(DVI);
+ if (auto *L = LocalAsMetadata::getIfExists(&AI)) {
+ if (auto *DebugNode = MetadataAsValue::getIfExists(AI.getContext(), L)) {
+ for (User *U : DebugNode->users())
+ if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
+ DDIs.push_back(DDI);
+ else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
+ DVIs.push_back(DVI);
+ }
}
LoadAndStorePromoter::run(Insts);
@@ -825,8 +1111,9 @@ public:
DVIs.pop_back_val()->eraseFromParent();
}
- bool isInstInList(Instruction *I,
- const SmallVectorImpl<Instruction*> &Insts) const override {
+ bool
+ isInstInList(Instruction *I,
+ const SmallVectorImpl<Instruction *> &Insts) const override {
Value *Ptr;
if (LoadInst *LI = dyn_cast<LoadInst>(I))
Ptr = LI->getOperand(0);
@@ -884,7 +1171,6 @@ public:
};
} // end anon namespace
-
namespace {
/// \brief An optimization pass providing Scalar Replacement of Aggregates.
///
@@ -910,7 +1196,7 @@ class SROA : public FunctionPass {
LLVMContext *C;
const DataLayout *DL;
DominatorTree *DT;
- AssumptionTracker *AT;
+ AssumptionCache *AC;
/// \brief Worklist of alloca instructions to simplify.
///
@@ -919,12 +1205,12 @@ class SROA : public FunctionPass {
/// directly promoted. Finally, each time we rewrite a use of an alloca other
/// the one being actively rewritten, we add it back onto the list if not
/// already present to ensure it is re-visited.
- SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > Worklist;
+ SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> Worklist;
/// \brief A collection of instructions to delete.
/// We try to batch deletions to simplify code and make things a bit more
/// efficient.
- SetVector<Instruction *, SmallVector<Instruction *, 8> > DeadInsts;
+ SetVector<Instruction *, SmallVector<Instruction *, 8>> DeadInsts;
/// \brief Post-promotion worklist.
///
@@ -934,7 +1220,7 @@ class SROA : public FunctionPass {
///
/// Note that we have to be very careful to clear allocas out of this list in
/// the event they are deleted.
- SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > PostPromotionWorklist;
+ SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> PostPromotionWorklist;
/// \brief A collection of alloca instructions we can directly promote.
std::vector<AllocaInst *> PromotableAllocas;
@@ -944,7 +1230,7 @@ class SROA : public FunctionPass {
/// All of these PHIs have been checked for the safety of speculation and by
/// being speculated will allow promoting allocas currently in the promotable
/// queue.
- SetVector<PHINode *, SmallVector<PHINode *, 2> > SpeculatablePHIs;
+ SetVector<PHINode *, SmallVector<PHINode *, 2>> SpeculatablePHIs;
/// \brief A worklist of select instructions to speculate prior to promoting
/// allocas.
@@ -952,12 +1238,12 @@ class SROA : public FunctionPass {
/// All of these select instructions have been checked for the safety of
/// speculation and by being speculated will allow promoting allocas
/// currently in the promotable queue.
- SetVector<SelectInst *, SmallVector<SelectInst *, 2> > SpeculatableSelects;
+ SetVector<SelectInst *, SmallVector<SelectInst *, 2>> SpeculatableSelects;
public:
SROA(bool RequiresDomTree = true)
- : FunctionPass(ID), RequiresDomTree(RequiresDomTree),
- C(nullptr), DL(nullptr), DT(nullptr) {
+ : FunctionPass(ID), RequiresDomTree(RequiresDomTree), C(nullptr),
+ DL(nullptr), DT(nullptr) {
initializeSROAPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override;
@@ -970,10 +1256,9 @@ private:
friend class PHIOrSelectSpeculator;
friend class AllocaSliceRewriter;
- bool rewritePartition(AllocaInst &AI, AllocaSlices &AS,
- AllocaSlices::iterator B, AllocaSlices::iterator E,
- int64_t BeginOffset, int64_t EndOffset,
- ArrayRef<AllocaSlices::iterator> SplitUses);
+ bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
+ AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS,
+ AllocaSlices::Partition &P);
bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
bool runOnAlloca(AllocaInst &AI);
void clobberUse(Use &U);
@@ -988,12 +1273,12 @@ FunctionPass *llvm::createSROAPass(bool RequiresDomTree) {
return new SROA(RequiresDomTree);
}
-INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates",
- false, false)
+INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", false,
+ false)
/// Walk the range of a partitioning looking for a common type to cover this
/// sequence of slices.
@@ -1064,8 +1349,7 @@ static Type *findCommonType(AllocaSlices::const_iterator B,
///
/// FIXME: This should be hoisted into a generic utility, likely in
/// Transforms/Util/Local.h
-static bool isSafePHIToSpeculate(PHINode &PN,
- const DataLayout *DL = nullptr) {
+static bool isSafePHIToSpeculate(PHINode &PN, const DataLayout *DL = nullptr) {
// For now, we can only do this promotion if the load is in the same block
// as the PHI, and if there are no stores between the phi and load.
// TODO: Allow recursive phi users.
@@ -1325,7 +1609,8 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
SmallVectorImpl<Value *> &Indices,
Twine NamePrefix) {
if (Offset == 0)
- return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices, NamePrefix);
+ return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices,
+ NamePrefix);
// We can't recurse through pointer types.
if (Ty->isPointerTy())
@@ -1433,8 +1718,7 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
/// a single GEP as possible, thus making each GEP more independent of the
/// surrounding code.
static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
- APInt Offset, Type *PointerTy,
- Twine NamePrefix) {
+ APInt Offset, Type *PointerTy, Twine NamePrefix) {
// Even though we don't look through PHI nodes, we could be called on an
// instruction in an unreachable block, which may be on a cycle.
SmallPtrSet<Value *, 4> Visited;
@@ -1443,8 +1727,9 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
// We may end up computing an offset pointer that has the wrong type. If we
// never are able to compute one directly that has the correct type, we'll
- // fall back to it, so keep it around here.
+ // fall back to it, so keep it and the base it was computed from around here.
Value *OffsetPtr = nullptr;
+ Value *OffsetBasePtr;
// Remember any i8 pointer we come across to re-use if we need to do a raw
// byte offset.
@@ -1469,16 +1754,19 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
Indices.clear();
if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy,
Indices, NamePrefix)) {
- if (P->getType() == PointerTy) {
- // Zap any offset pointer that we ended up computing in previous rounds.
- if (OffsetPtr && OffsetPtr->use_empty())
- if (Instruction *I = dyn_cast<Instruction>(OffsetPtr))
- I->eraseFromParent();
+ // If we have a new natural pointer at the offset, clear out any old
+ // offset pointer we computed. Unless it is the base pointer or
+ // a non-instruction, we built a GEP we don't need. Zap it.
+ if (OffsetPtr && OffsetPtr != OffsetBasePtr)
+ if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) {
+ assert(I->use_empty() && "Built a GEP with uses some how!");
+ I->eraseFromParent();
+ }
+ OffsetPtr = P;
+ OffsetBasePtr = Ptr;
+ // If we also found a pointer of the right type, we're done.
+ if (P->getType() == PointerTy)
return P;
- }
- if (!OffsetPtr) {
- OffsetPtr = P;
- }
}
// Stash this pointer if we've found an i8*.
@@ -1508,9 +1796,10 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
Int8PtrOffset = Offset;
}
- OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr :
- IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset),
- NamePrefix + "sroa_raw_idx");
+ OffsetPtr = Int8PtrOffset == 0
+ ? Int8Ptr
+ : IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset),
+ NamePrefix + "sroa_raw_idx");
}
Ptr = OffsetPtr;
@@ -1521,6 +1810,27 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
return Ptr;
}
+/// \brief Compute the adjusted alignment for a load or store from an offset.
+static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset,
+ const DataLayout &DL) {
+ unsigned Alignment;
+ Type *Ty;
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ Alignment = LI->getAlignment();
+ Ty = LI->getType();
+ } else if (auto *SI = dyn_cast<StoreInst>(I)) {
+ Alignment = SI->getAlignment();
+ Ty = SI->getValueOperand()->getType();
+ } else {
+ llvm_unreachable("Only loads and stores are allowed!");
+ }
+
+ if (!Alignment)
+ Alignment = DL.getABITypeAlignment(Ty);
+
+ return MinAlign(Alignment, Offset);
+}
+
/// \brief Test whether we can convert a value from the old to the new type.
///
/// This predicate should be used to guard calls to convertValue in order to
@@ -1614,19 +1924,19 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
///
/// This function is called to test each entry in a partioning which is slated
/// for a single slice.
-static bool
-isVectorPromotionViableForSlice(const DataLayout &DL, uint64_t SliceBeginOffset,
- uint64_t SliceEndOffset, VectorType *Ty,
- uint64_t ElementSize, const Slice &S) {
+static bool isVectorPromotionViableForSlice(AllocaSlices::Partition &P,
+ const Slice &S, VectorType *Ty,
+ uint64_t ElementSize,
+ const DataLayout &DL) {
// First validate the slice offsets.
uint64_t BeginOffset =
- std::max(S.beginOffset(), SliceBeginOffset) - SliceBeginOffset;
+ std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
uint64_t BeginIndex = BeginOffset / ElementSize;
if (BeginIndex * ElementSize != BeginOffset ||
BeginIndex >= Ty->getNumElements())
return false;
uint64_t EndOffset =
- std::min(S.endOffset(), SliceEndOffset) - SliceBeginOffset;
+ std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
uint64_t EndIndex = EndOffset / ElementSize;
if (EndIndex * ElementSize != EndOffset || EndIndex > Ty->getNumElements())
return false;
@@ -1658,7 +1968,7 @@ isVectorPromotionViableForSlice(const DataLayout &DL, uint64_t SliceBeginOffset,
if (LI->isVolatile())
return false;
Type *LTy = LI->getType();
- if (SliceBeginOffset > S.beginOffset() || SliceEndOffset < S.endOffset()) {
+ if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
assert(LTy->isIntegerTy());
LTy = SplitIntTy;
}
@@ -1668,7 +1978,7 @@ isVectorPromotionViableForSlice(const DataLayout &DL, uint64_t SliceBeginOffset,
if (SI->isVolatile())
return false;
Type *STy = SI->getValueOperand()->getType();
- if (SliceBeginOffset > S.beginOffset() || SliceEndOffset < S.endOffset()) {
+ if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
assert(STy->isIntegerTy());
STy = SplitIntTy;
}
@@ -1690,11 +2000,8 @@ isVectorPromotionViableForSlice(const DataLayout &DL, uint64_t SliceBeginOffset,
/// SSA value. We only can ensure this for a limited set of operations, and we
/// don't want to do the rewrites unless we are confident that the result will
/// be promotable, so we have an early test here.
-static VectorType *
-isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy,
- uint64_t SliceBeginOffset, uint64_t SliceEndOffset,
- AllocaSlices::const_range Slices,
- ArrayRef<AllocaSlices::iterator> SplitUses) {
+static VectorType *isVectorPromotionViable(AllocaSlices::Partition &P,
+ const DataLayout &DL) {
// Collect the candidate types for vector-based promotion. Also track whether
// we have different element types.
SmallVector<VectorType *, 4> CandidateTys;
@@ -1709,11 +2016,10 @@ isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy,
HaveCommonEltTy = false;
}
};
- CheckCandidateType(AllocaTy);
// Consider any loads or stores that are the exact size of the slice.
- for (const auto &S : Slices)
- if (S.beginOffset() == SliceBeginOffset &&
- S.endOffset() == SliceEndOffset) {
+ for (const Slice &S : P)
+ if (S.beginOffset() == P.beginOffset() &&
+ S.endOffset() == P.endOffset()) {
if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
CheckCandidateType(LI->getType());
else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
@@ -1780,14 +2086,12 @@ isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy,
"vector size not a multiple of element size?");
ElementSize /= 8;
- for (const auto &S : Slices)
- if (!isVectorPromotionViableForSlice(DL, SliceBeginOffset, SliceEndOffset,
- VTy, ElementSize, S))
+ for (const Slice &S : P)
+ if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
return false;
- for (const auto &SI : SplitUses)
- if (!isVectorPromotionViableForSlice(DL, SliceBeginOffset, SliceEndOffset,
- VTy, ElementSize, *SI))
+ for (const Slice *S : P.splitSliceTails())
+ if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
return false;
return true;
@@ -1803,12 +2107,13 @@ isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy,
///
/// This implements the necessary checking for the \c isIntegerWideningViable
/// test below on a single slice of the alloca.
-static bool isIntegerWideningViableForSlice(const DataLayout &DL,
- Type *AllocaTy,
+static bool isIntegerWideningViableForSlice(const Slice &S,
uint64_t AllocBeginOffset,
- uint64_t Size,
- const Slice &S,
+ Type *AllocaTy,
+ const DataLayout &DL,
bool &WholeAllocaOp) {
+ uint64_t Size = DL.getTypeStoreSize(AllocaTy);
+
uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
@@ -1876,11 +2181,8 @@ static bool isIntegerWideningViableForSlice(const DataLayout &DL,
/// This is a quick test to check whether we can rewrite the integer loads and
/// stores to a particular alloca into wider loads and stores and be able to
/// promote the resulting alloca.
-static bool
-isIntegerWideningViable(const DataLayout &DL, Type *AllocaTy,
- uint64_t AllocBeginOffset,
- AllocaSlices::const_range Slices,
- ArrayRef<AllocaSlices::iterator> SplitUses) {
+static bool isIntegerWideningViable(AllocaSlices::Partition &P, Type *AllocaTy,
+ const DataLayout &DL) {
uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy);
// Don't create integer types larger than the maximum bitwidth.
if (SizeInBits > IntegerType::MAX_INT_BITS)
@@ -1898,24 +2200,24 @@ isIntegerWideningViable(const DataLayout &DL, Type *AllocaTy,
!canConvertValue(DL, IntTy, AllocaTy))
return false;
- uint64_t Size = DL.getTypeStoreSize(AllocaTy);
-
// While examining uses, we ensure that the alloca has a covering load or
// store. We don't want to widen the integer operations only to fail to
// promote due to some other unsplittable entry (which we may make splittable
// later). However, if there are only splittable uses, go ahead and assume
// that we cover the alloca.
+ // FIXME: We shouldn't consider split slices that happen to start in the
+ // partition here...
bool WholeAllocaOp =
- Slices.begin() != Slices.end() ? false : DL.isLegalInteger(SizeInBits);
+ P.begin() != P.end() ? false : DL.isLegalInteger(SizeInBits);
- for (const auto &S : Slices)
- if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size,
- S, WholeAllocaOp))
+ for (const Slice &S : P)
+ if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
+ WholeAllocaOp))
return false;
- for (const auto &SI : SplitUses)
- if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size,
- *SI, WholeAllocaOp))
+ for (const Slice *S : P.splitSliceTails())
+ if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
+ WholeAllocaOp))
return false;
return WholeAllocaOp;
@@ -1928,9 +2230,9 @@ static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
IntegerType *IntTy = cast<IntegerType>(V->getType());
assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
"Element extends past full value");
- uint64_t ShAmt = 8*Offset;
+ uint64_t ShAmt = 8 * Offset;
if (DL.isBigEndian())
- ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
+ ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
if (ShAmt) {
V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
DEBUG(dbgs() << " shifted: " << *V << "\n");
@@ -1957,9 +2259,9 @@ static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
}
assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
"Element store outside of alloca store");
- uint64_t ShAmt = 8*Offset;
+ uint64_t ShAmt = 8 * Offset;
if (DL.isBigEndian())
- ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
+ ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
if (ShAmt) {
V = IRB.CreateShl(V, ShAmt, Name + ".shift");
DEBUG(dbgs() << " shifted: " << *V << "\n");
@@ -1975,9 +2277,8 @@ static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
return V;
}
-static Value *extractVector(IRBuilderTy &IRB, Value *V,
- unsigned BeginIndex, unsigned EndIndex,
- const Twine &Name) {
+static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
+ unsigned EndIndex, const Twine &Name) {
VectorType *VecTy = cast<VectorType>(V->getType());
unsigned NumElements = EndIndex - BeginIndex;
assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
@@ -1992,13 +2293,12 @@ static Value *extractVector(IRBuilderTy &IRB, Value *V,
return V;
}
- SmallVector<Constant*, 8> Mask;
+ SmallVector<Constant *, 8> Mask;
Mask.reserve(NumElements);
for (unsigned i = BeginIndex; i != EndIndex; ++i)
Mask.push_back(IRB.getInt32(i));
V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
- ConstantVector::get(Mask),
- Name + ".extract");
+ ConstantVector::get(Mask), Name + ".extract");
DEBUG(dbgs() << " shuffle: " << *V << "\n");
return V;
}
@@ -2013,7 +2313,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
// Single element to insert.
V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
Name + ".insert");
- DEBUG(dbgs() << " insert: " << *V << "\n");
+ DEBUG(dbgs() << " insert: " << *V << "\n");
return V;
}
@@ -2029,7 +2329,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
// use a shuffle vector to widen it with undef elements, and then
// a second shuffle vector to select between the loaded vector and the
// incoming vector.
- SmallVector<Constant*, 8> Mask;
+ SmallVector<Constant *, 8> Mask;
Mask.reserve(VecTy->getNumElements());
for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
if (i >= BeginIndex && i < EndIndex)
@@ -2037,8 +2337,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
else
Mask.push_back(UndefValue::get(IRB.getInt32Ty()));
V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
- ConstantVector::get(Mask),
- Name + ".expand");
+ ConstantVector::get(Mask), Name + ".expand");
DEBUG(dbgs() << " shuffle: " << *V << "\n");
Mask.clear();
@@ -2148,6 +2447,9 @@ public:
IsSplittable = I->isSplittable();
IsSplit =
BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
+ DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
+ DEBUG(AS.printSlice(dbgs(), I, ""));
+ DEBUG(dbgs() << "\n");
// Compute the intersecting offset range.
assert(BeginOffset < NewAllocaEndOffset);
@@ -2218,7 +2520,8 @@ private:
);
}
- /// \brief Compute suitable alignment to access this slice of the *new* alloca.
+ /// \brief Compute suitable alignment to access this slice of the *new*
+ /// alloca.
///
/// You can optionally pass a type to this routine and if that type's ABI
/// alignment is itself suitable, this will return zero.
@@ -2226,7 +2529,8 @@ private:
unsigned NewAIAlign = NewAI.getAlignment();
if (!NewAIAlign)
NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType());
- unsigned Align = MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset);
+ unsigned Align =
+ MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset);
return (Ty && Align == DL.getABITypeAlignment(Ty)) ? 0 : Align;
}
@@ -2250,16 +2554,14 @@ private:
unsigned EndIndex = getIndex(NewEndOffset);
assert(EndIndex > BeginIndex && "Empty vector!");
- Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- "load");
+ Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
return extractVector(IRB, V, BeginIndex, EndIndex, "vec");
}
Value *rewriteIntegerLoad(LoadInst &LI) {
assert(IntTy && "We cannot insert an integer to the alloca");
assert(!LI.isVolatile());
- Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- "load");
+ Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
V = convertValue(DL, IRB, V, IntTy);
assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
@@ -2284,8 +2586,8 @@ private:
V = rewriteIntegerLoad(LI);
} else if (NewBeginOffset == NewAllocaBeginOffset &&
canConvertValue(DL, NewAllocaTy, LI.getType())) {
- V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- LI.isVolatile(), LI.getName());
+ V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), LI.isVolatile(),
+ LI.getName());
} else {
Type *LTy = TargetTy->getPointerTo();
V = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
@@ -2302,7 +2604,7 @@ private:
assert(SliceSize < DL.getTypeStoreSize(LI.getType()) &&
"Split load isn't smaller than original load");
assert(LI.getType()->getIntegerBitWidth() ==
- DL.getTypeStoreSizeInBits(LI.getType()) &&
+ DL.getTypeStoreSizeInBits(LI.getType()) &&
"Non-byte-multiple bit width");
// Move the insertion point just past the load so that we can refer to it.
IRB.SetInsertPoint(std::next(BasicBlock::iterator(&LI)));
@@ -2310,9 +2612,9 @@ private:
// basis for the new value. This allows us to replace the uses of LI with
// the computed value, and then replace the placeholder with LI, leaving
// LI only used for this computation.
- Value *Placeholder
- = new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
- V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset,
+ Value *Placeholder =
+ new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
+ V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
"insert");
LI.replaceAllUsesWith(V);
Placeholder->replaceAllUsesWith(&LI);
@@ -2334,15 +2636,14 @@ private:
assert(EndIndex > BeginIndex && "Empty vector!");
unsigned NumElements = EndIndex - BeginIndex;
assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
- Type *SliceTy =
- (NumElements == 1) ? ElementTy
- : VectorType::get(ElementTy, NumElements);
+ Type *SliceTy = (NumElements == 1)
+ ? ElementTy
+ : VectorType::get(ElementTy, NumElements);
if (V->getType() != SliceTy)
V = convertValue(DL, IRB, V, SliceTy);
// Mix in the existing elements.
- Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- "load");
+ Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
V = insertVector(IRB, Old, V, BeginIndex, "vec");
}
StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
@@ -2357,13 +2658,12 @@ private:
assert(IntTy && "We cannot extract an integer from the alloca");
assert(!SI.isVolatile());
if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
- Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- "oldload");
+ Value *Old =
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
Old = convertValue(DL, IRB, Old, IntTy);
assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
- V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset,
- "insert");
+ V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
}
V = convertValue(DL, IRB, V, NewAllocaTy);
StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
@@ -2391,10 +2691,10 @@ private:
assert(V->getType()->isIntegerTy() &&
"Only integer type loads and stores are split");
assert(V->getType()->getIntegerBitWidth() ==
- DL.getTypeStoreSizeInBits(V->getType()) &&
+ DL.getTypeStoreSizeInBits(V->getType()) &&
"Non-byte-multiple bit width");
IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
- V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset,
+ V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
"extract");
}
@@ -2439,14 +2739,14 @@ private:
if (Size == 1)
return V;
- Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8);
- V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, "zext"),
- ConstantExpr::getUDiv(
- Constant::getAllOnesValue(SplatIntTy),
- ConstantExpr::getZExt(
- Constant::getAllOnesValue(V->getType()),
- SplatIntTy)),
- "isplat");
+ Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
+ V = IRB.CreateMul(
+ IRB.CreateZExt(V, SplatIntTy, "zext"),
+ ConstantExpr::getUDiv(
+ Constant::getAllOnesValue(SplatIntTy),
+ ConstantExpr::getZExt(Constant::getAllOnesValue(V->getType()),
+ SplatIntTy)),
+ "isplat");
return V;
}
@@ -2483,12 +2783,11 @@ private:
// If this doesn't map cleanly onto the alloca type, and that type isn't
// a single value type, just emit a memset.
if (!VecTy && !IntTy &&
- (BeginOffset > NewAllocaBeginOffset ||
- EndOffset < NewAllocaEndOffset ||
+ (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
SliceSize != DL.getTypeStoreSize(AllocaTy) ||
!AllocaTy->isSingleValueType() ||
!DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy)) ||
- DL.getTypeSizeInBits(ScalarTy)%8 != 0)) {
+ DL.getTypeSizeInBits(ScalarTy) % 8 != 0)) {
Type *SizeTy = II.getLength()->getType();
Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
CallInst *New = IRB.CreateMemSet(
@@ -2522,8 +2821,8 @@ private:
if (NumElements > 1)
Splat = getVectorSplat(Splat, NumElements);
- Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- "oldload");
+ Value *Old =
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
} else if (IntTy) {
// If this is a memset on an alloca where we can widen stores, insert the
@@ -2535,8 +2834,8 @@ private:
if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
EndOffset != NewAllocaBeginOffset)) {
- Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- "oldload");
+ Value *Old =
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
Old = convertValue(DL, IRB, Old, IntTy);
uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
V = insertInteger(DL, IRB, Old, V, Offset, "insert");
@@ -2633,8 +2932,8 @@ private:
// Strip all inbounds GEPs and pointer casts to try to dig out any root
// alloca that should be re-examined after rewriting this instruction.
Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
- if (AllocaInst *AI
- = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
+ if (AllocaInst *AI =
+ dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
assert(AI != &OldAI && AI != &NewAI &&
"Splittable transfers cannot reach the same alloca on both ends.");
Pass.Worklist.insert(AI);
@@ -2673,8 +2972,8 @@ private:
unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
unsigned NumElements = EndIndex - BeginIndex;
- IntegerType *SubIntTy
- = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : nullptr;
+ IntegerType *SubIntTy =
+ IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
// Reset the other pointer type to match the register type we're going to
// use, but using the address space of the original other pointer.
@@ -2703,27 +3002,25 @@ private:
Value *Src;
if (VecTy && !IsWholeAlloca && !IsDest) {
- Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- "load");
+ Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
} else if (IntTy && !IsWholeAlloca && !IsDest) {
- Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- "load");
+ Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
Src = convertValue(DL, IRB, Src, IntTy);
uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
} else {
- Src = IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(),
- "copyload");
+ Src =
+ IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), "copyload");
}
if (VecTy && !IsWholeAlloca && IsDest) {
- Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- "oldload");
+ Value *Old =
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
} else if (IntTy && !IsWholeAlloca && IsDest) {
- Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- "oldload");
+ Value *Old =
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
Old = convertValue(DL, IRB, Old, IntTy);
uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
@@ -2746,8 +3043,8 @@ private:
// Record this instruction for deletion.
Pass.DeadInsts.insert(&II);
- ConstantInt *Size
- = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
+ ConstantInt *Size =
+ ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
NewEndOffset - NewBeginOffset);
Value *Ptr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
Value *New;
@@ -2814,7 +3111,6 @@ private:
SelectUsers.insert(&SI);
return true;
}
-
};
}
@@ -2869,8 +3165,7 @@ private:
bool visitInstruction(Instruction &I) { return false; }
/// \brief Generic recursive split emission class.
- template <typename Derived>
- class OpSplitter {
+ template <typename Derived> class OpSplitter {
protected:
/// The builder used to form new instructions.
IRBuilderTy IRB;
@@ -2887,7 +3182,7 @@ private:
/// Initialize the splitter with an insertion point, Ptr and start with a
/// single zero GEP index.
OpSplitter(Instruction *InsertionPoint, Value *Ptr)
- : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}
+ : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}
public:
/// \brief Generic recursive split emission routine.
@@ -2943,7 +3238,7 @@ private:
struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr)
- : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {}
+ : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {}
/// Emit a leaf load of a single value. This is called at the leaves of the
/// recursive emission to actually load values.
@@ -2974,7 +3269,7 @@ private:
struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr)
- : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {}
+ : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {}
/// Emit a leaf store of a single value. This is called at the leaves of the
/// recursive emission to actually produce stores.
@@ -2982,8 +3277,8 @@ private:
assert(Ty->isSingleValueType());
// Extract the single value and store it using the indices.
Value *Store = IRB.CreateStore(
- IRB.CreateExtractValue(Agg, Indices, Name + ".extract"),
- IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep"));
+ IRB.CreateExtractValue(Agg, Indices, Name + ".extract"),
+ IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep"));
(void)Store;
DEBUG(dbgs() << " to: " << *Store << "\n");
}
@@ -3069,8 +3364,8 @@ static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
/// when the size or offset cause either end of type-based partition to be off.
/// Also, this is a best-effort routine. It is reasonable to give up and not
/// return a type if necessary.
-static Type *getTypePartition(const DataLayout &DL, Type *Ty,
- uint64_t Offset, uint64_t Size) {
+static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
+ uint64_t Size) {
if (Offset == 0 && DL.getTypeAllocSize(Ty) == Size)
return stripAggregateTypeWrapping(DL, Ty);
if (Offset > DL.getTypeAllocSize(Ty) ||
@@ -3162,8 +3457,8 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty,
}
// Try to build up a sub-structure.
- StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE),
- STy->isPacked());
+ StructType *SubTy =
+ StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked());
const StructLayout *SubSL = DL.getStructLayout(SubTy);
if (Size != SubSL->getSizeInBytes())
return nullptr; // The sub-struct doesn't have quite the size needed.
@@ -3171,6 +3466,494 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty,
return SubTy;
}
+/// \brief Pre-split loads and stores to simplify rewriting.
+///
+/// We want to break up the splittable load+store pairs as much as
+/// possible. This is important to do as a preprocessing step, as once we
+/// start rewriting the accesses to partitions of the alloca we lose the
+/// necessary information to correctly split apart paired loads and stores
+/// which both point into this alloca. The case to consider is something like
+/// the following:
+///
+/// %a = alloca [12 x i8]
+/// %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0
+/// %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4
+/// %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8
+/// %iptr1 = bitcast i8* %gep1 to i64*
+/// %iptr2 = bitcast i8* %gep2 to i64*
+/// %fptr1 = bitcast i8* %gep1 to float*
+/// %fptr2 = bitcast i8* %gep2 to float*
+/// %fptr3 = bitcast i8* %gep3 to float*
+/// store float 0.0, float* %fptr1
+/// store float 1.0, float* %fptr2
+/// %v = load i64* %iptr1
+/// store i64 %v, i64* %iptr2
+/// %f1 = load float* %fptr2
+/// %f2 = load float* %fptr3
+///
+/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
+/// promote everything so we recover the 2 SSA values that should have been
+/// there all along.
+///
+/// \returns true if any changes are made.
+bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
+ DEBUG(dbgs() << "Pre-splitting loads and stores\n");
+
+ // Track the loads and stores which are candidates for pre-splitting here, in
+ // the order they first appear during the partition scan. These give stable
+ // iteration order and a basis for tracking which loads and stores we
+ // actually split.
+ SmallVector<LoadInst *, 4> Loads;
+ SmallVector<StoreInst *, 4> Stores;
+
+ // We need to accumulate the splits required of each load or store where we
+ // can find them via a direct lookup. This is important to cross-check loads
+ // and stores against each other. We also track the slice so that we can kill
+ // all the slices that end up split.
+ struct SplitOffsets {
+ Slice *S;
+ std::vector<uint64_t> Splits;
+ };
+ SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
+
+ // Track loads out of this alloca which cannot, for any reason, be pre-split.
+ // This is important as we also cannot pre-split stores of those loads!
+ // FIXME: This is all pretty gross. It means that we can be more aggressive
+ // in pre-splitting when the load feeding the store happens to come from
+ // a separate alloca. Put another way, the effectiveness of SROA would be
+ // decreased by a frontend which just concatenated all of its local allocas
+ // into one big flat alloca. But defeating such patterns is exactly the job
+ // SROA is tasked with! Sadly, to not have this discrepancy we would have
+ // change store pre-splitting to actually force pre-splitting of the load
+ // that feeds it *and all stores*. That makes pre-splitting much harder, but
+ // maybe it would make it more principled?
+ SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
+
+ DEBUG(dbgs() << " Searching for candidate loads and stores\n");
+ for (auto &P : AS.partitions()) {
+ for (Slice &S : P) {
+ Instruction *I = cast<Instruction>(S.getUse()->getUser());
+ if (!S.isSplittable() ||S.endOffset() <= P.endOffset()) {
+ // If this was a load we have to track that it can't participate in any
+ // pre-splitting!
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ UnsplittableLoads.insert(LI);
+ continue;
+ }
+ assert(P.endOffset() > S.beginOffset() &&
+ "Empty or backwards partition!");
+
+ // Determine if this is a pre-splittable slice.
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ assert(!LI->isVolatile() && "Cannot split volatile loads!");
+
+ // The load must be used exclusively to store into other pointers for
+ // us to be able to arbitrarily pre-split it. The stores must also be
+ // simple to avoid changing semantics.
+ auto IsLoadSimplyStored = [](LoadInst *LI) {
+ for (User *LU : LI->users()) {
+ auto *SI = dyn_cast<StoreInst>(LU);
+ if (!SI || !SI->isSimple())
+ return false;
+ }
+ return true;
+ };
+ if (!IsLoadSimplyStored(LI)) {
+ UnsplittableLoads.insert(LI);
+ continue;
+ }
+
+ Loads.push_back(LI);
+ } else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser())) {
+ if (!SI ||
+ S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
+ continue;
+ auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
+ if (!StoredLoad || !StoredLoad->isSimple())
+ continue;
+ assert(!SI->isVolatile() && "Cannot split volatile stores!");
+
+ Stores.push_back(SI);
+ } else {
+ // Other uses cannot be pre-split.
+ continue;
+ }
+
+ // Record the initial split.
+ DEBUG(dbgs() << " Candidate: " << *I << "\n");
+ auto &Offsets = SplitOffsetsMap[I];
+ assert(Offsets.Splits.empty() &&
+ "Should not have splits the first time we see an instruction!");
+ Offsets.S = &S;
+ Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
+ }
+
+ // Now scan the already split slices, and add a split for any of them which
+ // we're going to pre-split.
+ for (Slice *S : P.splitSliceTails()) {
+ auto SplitOffsetsMapI =
+ SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
+ if (SplitOffsetsMapI == SplitOffsetsMap.end())
+ continue;
+ auto &Offsets = SplitOffsetsMapI->second;
+
+ assert(Offsets.S == S && "Found a mismatched slice!");
+ assert(!Offsets.Splits.empty() &&
+ "Cannot have an empty set of splits on the second partition!");
+ assert(Offsets.Splits.back() ==
+ P.beginOffset() - Offsets.S->beginOffset() &&
+ "Previous split does not end where this one begins!");
+
+ // Record each split. The last partition's end isn't needed as the size
+ // of the slice dictates that.
+ if (S->endOffset() > P.endOffset())
+ Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
+ }
+ }
+
+ // We may have split loads where some of their stores are split stores. For
+ // such loads and stores, we can only pre-split them if their splits exactly
+ // match relative to their starting offset. We have to verify this prior to
+ // any rewriting.
+ Stores.erase(
+ std::remove_if(Stores.begin(), Stores.end(),
+ [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
+ // Lookup the load we are storing in our map of split
+ // offsets.
+ auto *LI = cast<LoadInst>(SI->getValueOperand());
+ // If it was completely unsplittable, then we're done,
+ // and this store can't be pre-split.
+ if (UnsplittableLoads.count(LI))
+ return true;
+
+ auto LoadOffsetsI = SplitOffsetsMap.find(LI);
+ if (LoadOffsetsI == SplitOffsetsMap.end())
+ return false; // Unrelated loads are definitely safe.
+ auto &LoadOffsets = LoadOffsetsI->second;
+
+ // Now lookup the store's offsets.
+ auto &StoreOffsets = SplitOffsetsMap[SI];
+
+ // If the relative offsets of each split in the load and
+ // store match exactly, then we can split them and we
+ // don't need to remove them here.
+ if (LoadOffsets.Splits == StoreOffsets.Splits)
+ return false;
+
+ DEBUG(dbgs()
+ << " Mismatched splits for load and store:\n"
+ << " " << *LI << "\n"
+ << " " << *SI << "\n");
+
+ // We've found a store and load that we need to split
+ // with mismatched relative splits. Just give up on them
+ // and remove both instructions from our list of
+ // candidates.
+ UnsplittableLoads.insert(LI);
+ return true;
+ }),
+ Stores.end());
+ // Now we have to go *back* through all te stores, because a later store may
+ // have caused an earlier store's load to become unsplittable and if it is
+ // unsplittable for the later store, then we can't rely on it being split in
+ // the earlier store either.
+ Stores.erase(std::remove_if(Stores.begin(), Stores.end(),
+ [&UnsplittableLoads](StoreInst *SI) {
+ auto *LI =
+ cast<LoadInst>(SI->getValueOperand());
+ return UnsplittableLoads.count(LI);
+ }),
+ Stores.end());
+ // Once we've established all the loads that can't be split for some reason,
+ // filter any that made it into our list out.
+ Loads.erase(std::remove_if(Loads.begin(), Loads.end(),
+ [&UnsplittableLoads](LoadInst *LI) {
+ return UnsplittableLoads.count(LI);
+ }),
+ Loads.end());
+
+
+ // If no loads or stores are left, there is no pre-splitting to be done for
+ // this alloca.
+ if (Loads.empty() && Stores.empty())
+ return false;
+
+ // From here on, we can't fail and will be building new accesses, so rig up
+ // an IR builder.
+ IRBuilderTy IRB(&AI);
+
+ // Collect the new slices which we will merge into the alloca slices.
+ SmallVector<Slice, 4> NewSlices;
+
+ // Track any allocas we end up splitting loads and stores for so we iterate
+ // on them.
+ SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
+
+ // At this point, we have collected all of the loads and stores we can
+ // pre-split, and the specific splits needed for them. We actually do the
+ // splitting in a specific order in order to handle when one of the loads in
+ // the value operand to one of the stores.
+ //
+ // First, we rewrite all of the split loads, and just accumulate each split
+ // load in a parallel structure. We also build the slices for them and append
+ // them to the alloca slices.
+ SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
+ std::vector<LoadInst *> SplitLoads;
+ for (LoadInst *LI : Loads) {
+ SplitLoads.clear();
+
+ IntegerType *Ty = cast<IntegerType>(LI->getType());
+ uint64_t LoadSize = Ty->getBitWidth() / 8;
+ assert(LoadSize > 0 && "Cannot have a zero-sized integer load!");
+
+ auto &Offsets = SplitOffsetsMap[LI];
+ assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
+ "Slice size should always match load size exactly!");
+ uint64_t BaseOffset = Offsets.S->beginOffset();
+ assert(BaseOffset + LoadSize > BaseOffset &&
+ "Cannot represent alloca access size using 64-bit integers!");
+
+ Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
+ IRB.SetInsertPoint(BasicBlock::iterator(LI));
+
+ DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
+
+ uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
+ int Idx = 0, Size = Offsets.Splits.size();
+ for (;;) {
+ auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
+ auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
+ LoadInst *PLoad = IRB.CreateAlignedLoad(
+ getAdjustedPtr(IRB, *DL, BasePtr,
+ APInt(DL->getPointerSizeInBits(), PartOffset),
+ PartPtrTy, BasePtr->getName() + "."),
+ getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false,
+ LI->getName());
+
+ // Append this load onto the list of split loads so we can find it later
+ // to rewrite the stores.
+ SplitLoads.push_back(PLoad);
+
+ // Now build a new slice for the alloca.
+ NewSlices.push_back(
+ Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
+ &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
+ /*IsSplittable*/ false));
+ DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
+ << ", " << NewSlices.back().endOffset() << "): " << *PLoad
+ << "\n");
+
+ // See if we've handled all the splits.
+ if (Idx >= Size)
+ break;
+
+ // Setup the next partition.
+ PartOffset = Offsets.Splits[Idx];
+ ++Idx;
+ PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset;
+ }
+
+ // Now that we have the split loads, do the slow walk over all uses of the
+ // load and rewrite them as split stores, or save the split loads to use
+ // below if the store is going to be split there anyways.
+ bool DeferredStores = false;
+ for (User *LU : LI->users()) {
+ StoreInst *SI = cast<StoreInst>(LU);
+ if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
+ DeferredStores = true;
+ DEBUG(dbgs() << " Deferred splitting of store: " << *SI << "\n");
+ continue;
+ }
+
+ Value *StoreBasePtr = SI->getPointerOperand();
+ IRB.SetInsertPoint(BasicBlock::iterator(SI));
+
+ DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
+
+ for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
+ LoadInst *PLoad = SplitLoads[Idx];
+ uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
+ auto *PartPtrTy =
+ PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
+
+ StoreInst *PStore = IRB.CreateAlignedStore(
+ PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr,
+ APInt(DL->getPointerSizeInBits(), PartOffset),
+ PartPtrTy, StoreBasePtr->getName() + "."),
+ getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false);
+ (void)PStore;
+ DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
+ }
+
+ // We want to immediately iterate on any allocas impacted by splitting
+ // this store, and we have to track any promotable alloca (indicated by
+ // a direct store) as needing to be resplit because it is no longer
+ // promotable.
+ if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
+ ResplitPromotableAllocas.insert(OtherAI);
+ Worklist.insert(OtherAI);
+ } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+ StoreBasePtr->stripInBoundsOffsets())) {
+ Worklist.insert(OtherAI);
+ }
+
+ // Mark the original store as dead.
+ DeadInsts.insert(SI);
+ }
+
+ // Save the split loads if there are deferred stores among the users.
+ if (DeferredStores)
+ SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
+
+ // Mark the original load as dead and kill the original slice.
+ DeadInsts.insert(LI);
+ Offsets.S->kill();
+ }
+
+ // Second, we rewrite all of the split stores. At this point, we know that
+ // all loads from this alloca have been split already. For stores of such
+ // loads, we can simply look up the pre-existing split loads. For stores of
+ // other loads, we split those loads first and then write split stores of
+ // them.
+ for (StoreInst *SI : Stores) {
+ auto *LI = cast<LoadInst>(SI->getValueOperand());
+ IntegerType *Ty = cast<IntegerType>(LI->getType());
+ uint64_t StoreSize = Ty->getBitWidth() / 8;
+ assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
+
+ auto &Offsets = SplitOffsetsMap[SI];
+ assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
+ "Slice size should always match load size exactly!");
+ uint64_t BaseOffset = Offsets.S->beginOffset();
+ assert(BaseOffset + StoreSize > BaseOffset &&
+ "Cannot represent alloca access size using 64-bit integers!");
+
+ Value *LoadBasePtr = LI->getPointerOperand();
+ Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
+
+ DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
+
+ // Check whether we have an already split load.
+ auto SplitLoadsMapI = SplitLoadsMap.find(LI);
+ std::vector<LoadInst *> *SplitLoads = nullptr;
+ if (SplitLoadsMapI != SplitLoadsMap.end()) {
+ SplitLoads = &SplitLoadsMapI->second;
+ assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
+ "Too few split loads for the number of splits in the store!");
+ } else {
+ DEBUG(dbgs() << " of load: " << *LI << "\n");
+ }
+
+ uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
+ int Idx = 0, Size = Offsets.Splits.size();
+ for (;;) {
+ auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
+ auto *PartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
+
+ // Either lookup a split load or create one.
+ LoadInst *PLoad;
+ if (SplitLoads) {
+ PLoad = (*SplitLoads)[Idx];
+ } else {
+ IRB.SetInsertPoint(BasicBlock::iterator(LI));
+ PLoad = IRB.CreateAlignedLoad(
+ getAdjustedPtr(IRB, *DL, LoadBasePtr,
+ APInt(DL->getPointerSizeInBits(), PartOffset),
+ PartPtrTy, LoadBasePtr->getName() + "."),
+ getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false,
+ LI->getName());
+ }
+
+ // And store this partition.
+ IRB.SetInsertPoint(BasicBlock::iterator(SI));
+ StoreInst *PStore = IRB.CreateAlignedStore(
+ PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr,
+ APInt(DL->getPointerSizeInBits(), PartOffset),
+ PartPtrTy, StoreBasePtr->getName() + "."),
+ getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false);
+
+ // Now build a new slice for the alloca.
+ NewSlices.push_back(
+ Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
+ &PStore->getOperandUse(PStore->getPointerOperandIndex()),
+ /*IsSplittable*/ false));
+ DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
+ << ", " << NewSlices.back().endOffset() << "): " << *PStore
+ << "\n");
+ if (!SplitLoads) {
+ DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
+ }
+
+ // See if we've finished all the splits.
+ if (Idx >= Size)
+ break;
+
+ // Setup the next partition.
+ PartOffset = Offsets.Splits[Idx];
+ ++Idx;
+ PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
+ }
+
+ // We want to immediately iterate on any allocas impacted by splitting
+ // this load, which is only relevant if it isn't a load of this alloca and
+ // thus we didn't already split the loads above. We also have to keep track
+ // of any promotable allocas we split loads on as they can no longer be
+ // promoted.
+ if (!SplitLoads) {
+ if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
+ assert(OtherAI != &AI && "We can't re-split our own alloca!");
+ ResplitPromotableAllocas.insert(OtherAI);
+ Worklist.insert(OtherAI);
+ } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+ LoadBasePtr->stripInBoundsOffsets())) {
+ assert(OtherAI != &AI && "We can't re-split our own alloca!");
+ Worklist.insert(OtherAI);
+ }
+ }
+
+ // Mark the original store as dead now that we've split it up and kill its
+ // slice. Note that we leave the original load in place unless this store
+ // was its ownly use. It may in turn be split up if it is an alloca load
+ // for some other alloca, but it may be a normal load. This may introduce
+ // redundant loads, but where those can be merged the rest of the optimizer
+ // should handle the merging, and this uncovers SSA splits which is more
+ // important. In practice, the original loads will almost always be fully
+ // split and removed eventually, and the splits will be merged by any
+ // trivial CSE, including instcombine.
+ if (LI->hasOneUse()) {
+ assert(*LI->user_begin() == SI && "Single use isn't this store!");
+ DeadInsts.insert(LI);
+ }
+ DeadInsts.insert(SI);
+ Offsets.S->kill();
+ }
+
+ // Remove the killed slices that have ben pre-split.
+ AS.erase(std::remove_if(AS.begin(), AS.end(), [](const Slice &S) {
+ return S.isDead();
+ }), AS.end());
+
+ // Insert our new slices. This will sort and merge them into the sorted
+ // sequence.
+ AS.insert(NewSlices);
+
+ DEBUG(dbgs() << " Pre-split slices:\n");
+#ifndef NDEBUG
+ for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
+ DEBUG(AS.print(dbgs(), I, " "));
+#endif
+
+ // Finally, don't try to promote any allocas that new require re-splitting.
+ // They have already been added to the worklist above.
+ PromotableAllocas.erase(
+ std::remove_if(
+ PromotableAllocas.begin(), PromotableAllocas.end(),
+ [&](AllocaInst *AI) { return ResplitPromotableAllocas.count(AI); }),
+ PromotableAllocas.end());
+
+ return true;
+}
+
/// \brief Rewrite an alloca partition's users.
///
/// This routine drives both of the rewriting goals of the SROA pass. It tries
@@ -3181,40 +3964,31 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty,
/// appropriate new offsets. It also evaluates how successful the rewrite was
/// at enabling promotion and if it was successful queues the alloca to be
/// promoted.
-bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
- AllocaSlices::iterator B, AllocaSlices::iterator E,
- int64_t BeginOffset, int64_t EndOffset,
- ArrayRef<AllocaSlices::iterator> SplitUses) {
- assert(BeginOffset < EndOffset);
- uint64_t SliceSize = EndOffset - BeginOffset;
-
+AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
+ AllocaSlices::Partition &P) {
// Try to compute a friendly type for this partition of the alloca. This
// won't always succeed, in which case we fall back to a legal integer type
// or an i8 array of an appropriate size.
Type *SliceTy = nullptr;
- if (Type *CommonUseTy = findCommonType(B, E, EndOffset))
- if (DL->getTypeAllocSize(CommonUseTy) >= SliceSize)
+ if (Type *CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset()))
+ if (DL->getTypeAllocSize(CommonUseTy) >= P.size())
SliceTy = CommonUseTy;
if (!SliceTy)
if (Type *TypePartitionTy = getTypePartition(*DL, AI.getAllocatedType(),
- BeginOffset, SliceSize))
+ P.beginOffset(), P.size()))
SliceTy = TypePartitionTy;
if ((!SliceTy || (SliceTy->isArrayTy() &&
SliceTy->getArrayElementType()->isIntegerTy())) &&
- DL->isLegalInteger(SliceSize * 8))
- SliceTy = Type::getIntNTy(*C, SliceSize * 8);
+ DL->isLegalInteger(P.size() * 8))
+ SliceTy = Type::getIntNTy(*C, P.size() * 8);
if (!SliceTy)
- SliceTy = ArrayType::get(Type::getInt8Ty(*C), SliceSize);
- assert(DL->getTypeAllocSize(SliceTy) >= SliceSize);
+ SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
+ assert(DL->getTypeAllocSize(SliceTy) >= P.size());
- bool IsIntegerPromotable = isIntegerWideningViable(
- *DL, SliceTy, BeginOffset, AllocaSlices::const_range(B, E), SplitUses);
+ bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, *DL);
VectorType *VecTy =
- IsIntegerPromotable
- ? nullptr
- : isVectorPromotionViable(*DL, SliceTy, BeginOffset, EndOffset,
- AllocaSlices::const_range(B, E), SplitUses);
+ IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, *DL);
if (VecTy)
SliceTy = VecTy;
@@ -3224,11 +3998,12 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
// perform phi and select speculation.
AllocaInst *NewAI;
if (SliceTy == AI.getAllocatedType()) {
- assert(BeginOffset == 0 &&
+ assert(P.beginOffset() == 0 &&
"Non-zero begin offset but same alloca type");
NewAI = &AI;
// FIXME: We should be able to bail at this point with "nothing changed".
// FIXME: We might want to defer PHI speculation until after here.
+ // FIXME: return nullptr;
} else {
unsigned Alignment = AI.getAlignment();
if (!Alignment) {
@@ -3237,20 +4012,20 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
// type.
Alignment = DL->getABITypeAlignment(AI.getAllocatedType());
}
- Alignment = MinAlign(Alignment, BeginOffset);
+ Alignment = MinAlign(Alignment, P.beginOffset());
// If we will get at least this much alignment from the type alone, leave
// the alloca's alignment unconstrained.
if (Alignment <= DL->getABITypeAlignment(SliceTy))
Alignment = 0;
- NewAI =
- new AllocaInst(SliceTy, nullptr, Alignment,
- AI.getName() + ".sroa." + Twine(B - AS.begin()), &AI);
+ NewAI = new AllocaInst(
+ SliceTy, nullptr, Alignment,
+ AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
++NumNewAllocas;
}
DEBUG(dbgs() << "Rewriting alloca partition "
- << "[" << BeginOffset << "," << EndOffset << ") to: " << *NewAI
- << "\n");
+ << "[" << P.beginOffset() << "," << P.endOffset()
+ << ") to: " << *NewAI << "\n");
// Track the high watermark on the worklist as it is only relevant for
// promoted allocas. We will reset it to this point if the alloca is not in
@@ -3260,20 +4035,16 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
SmallPtrSet<PHINode *, 8> PHIUsers;
SmallPtrSet<SelectInst *, 8> SelectUsers;
- AllocaSliceRewriter Rewriter(*DL, AS, *this, AI, *NewAI, BeginOffset,
- EndOffset, IsIntegerPromotable, VecTy, PHIUsers,
- SelectUsers);
+ AllocaSliceRewriter Rewriter(*DL, AS, *this, AI, *NewAI, P.beginOffset(),
+ P.endOffset(), IsIntegerPromotable, VecTy,
+ PHIUsers, SelectUsers);
bool Promotable = true;
- for (auto & SplitUse : SplitUses) {
- DEBUG(dbgs() << " rewriting split ");
- DEBUG(AS.printSlice(dbgs(), SplitUse, ""));
- Promotable &= Rewriter.visit(SplitUse);
+ for (Slice *S : P.splitSliceTails()) {
+ Promotable &= Rewriter.visit(S);
++NumUses;
}
- for (AllocaSlices::iterator I = B; I != E; ++I) {
- DEBUG(dbgs() << " rewriting ");
- DEBUG(AS.printSlice(dbgs(), I, ""));
- Promotable &= Rewriter.visit(I);
+ for (Slice &S : P) {
+ Promotable &= Rewriter.visit(&S);
++NumUses;
}
@@ -3328,32 +4099,7 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
PostPromotionWorklist.pop_back();
}
- return true;
-}
-
-static void
-removeFinishedSplitUses(SmallVectorImpl<AllocaSlices::iterator> &SplitUses,
- uint64_t &MaxSplitUseEndOffset, uint64_t Offset) {
- if (Offset >= MaxSplitUseEndOffset) {
- SplitUses.clear();
- MaxSplitUseEndOffset = 0;
- return;
- }
-
- size_t SplitUsesOldSize = SplitUses.size();
- SplitUses.erase(std::remove_if(SplitUses.begin(), SplitUses.end(),
- [Offset](const AllocaSlices::iterator &I) {
- return I->endOffset() <= Offset;
- }),
- SplitUses.end());
- if (SplitUsesOldSize == SplitUses.size())
- return;
-
- // Recompute the max. While this is linear, so is remove_if.
- MaxSplitUseEndOffset = 0;
- for (AllocaSlices::iterator SplitUse : SplitUses)
- MaxSplitUseEndOffset =
- std::max(SplitUse->endOffset(), MaxSplitUseEndOffset);
+ return NewAI;
}
/// \brief Walks the slices of an alloca and form partitions based on them,
@@ -3364,108 +4110,100 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
unsigned NumPartitions = 0;
bool Changed = false;
- SmallVector<AllocaSlices::iterator, 4> SplitUses;
- uint64_t MaxSplitUseEndOffset = 0;
-
- uint64_t BeginOffset = AS.begin()->beginOffset();
-
- for (AllocaSlices::iterator SI = AS.begin(), SJ = std::next(SI),
- SE = AS.end();
- SI != SE; SI = SJ) {
- uint64_t MaxEndOffset = SI->endOffset();
-
- if (!SI->isSplittable()) {
- // When we're forming an unsplittable region, it must always start at the
- // first slice and will extend through its end.
- assert(BeginOffset == SI->beginOffset());
-
- // Form a partition including all of the overlapping slices with this
- // unsplittable slice.
- while (SJ != SE && SJ->beginOffset() < MaxEndOffset) {
- if (!SJ->isSplittable())
- MaxEndOffset = std::max(MaxEndOffset, SJ->endOffset());
- ++SJ;
- }
- } else {
- assert(SI->isSplittable()); // Established above.
-
- // Collect all of the overlapping splittable slices.
- while (SJ != SE && SJ->beginOffset() < MaxEndOffset &&
- SJ->isSplittable()) {
- MaxEndOffset = std::max(MaxEndOffset, SJ->endOffset());
- ++SJ;
- }
-
- // Back up MaxEndOffset and SJ if we ended the span early when
- // encountering an unsplittable slice.
- if (SJ != SE && SJ->beginOffset() < MaxEndOffset) {
- assert(!SJ->isSplittable());
- MaxEndOffset = SJ->beginOffset();
- }
- }
-
- // Check if we have managed to move the end offset forward yet. If so,
- // we'll have to rewrite uses and erase old split uses.
- if (BeginOffset < MaxEndOffset) {
- // Rewrite a sequence of overlapping slices.
- Changed |= rewritePartition(AI, AS, SI, SJ, BeginOffset, MaxEndOffset,
- SplitUses);
- ++NumPartitions;
-
- removeFinishedSplitUses(SplitUses, MaxSplitUseEndOffset, MaxEndOffset);
- }
- // Accumulate all the splittable slices from the [SI,SJ) region which
- // overlap going forward.
- for (AllocaSlices::iterator SK = SI; SK != SJ; ++SK)
- if (SK->isSplittable() && SK->endOffset() > MaxEndOffset) {
- SplitUses.push_back(SK);
- MaxSplitUseEndOffset = std::max(SK->endOffset(), MaxSplitUseEndOffset);
- }
-
- // If we're already at the end and we have no split uses, we're done.
- if (SJ == SE && SplitUses.empty())
- break;
+ // First try to pre-split loads and stores.
+ Changed |= presplitLoadsAndStores(AI, AS);
- // If we have no split uses or no gap in offsets, we're ready to move to
- // the next slice.
- if (SplitUses.empty() || (SJ != SE && MaxEndOffset == SJ->beginOffset())) {
- BeginOffset = SJ->beginOffset();
+ // Now that we have identified any pre-splitting opportunities, mark any
+ // splittable (non-whole-alloca) loads and stores as unsplittable. If we fail
+ // to split these during pre-splitting, we want to force them to be
+ // rewritten into a partition.
+ bool IsSorted = true;
+ for (Slice &S : AS) {
+ if (!S.isSplittable())
continue;
- }
-
- // Even if we have split slices, if the next slice is splittable and the
- // split slices reach it, we can simply set up the beginning offset of the
- // next iteration to bridge between them.
- if (SJ != SE && SJ->isSplittable() &&
- MaxSplitUseEndOffset > SJ->beginOffset()) {
- BeginOffset = MaxEndOffset;
+ // FIXME: We currently leave whole-alloca splittable loads and stores. This
+ // used to be the only splittable loads and stores and we need to be
+ // confident that the above handling of splittable loads and stores is
+ // completely sufficient before we forcibly disable the remaining handling.
+ if (S.beginOffset() == 0 &&
+ S.endOffset() >= DL->getTypeAllocSize(AI.getAllocatedType()))
continue;
+ if (isa<LoadInst>(S.getUse()->getUser()) ||
+ isa<StoreInst>(S.getUse()->getUser())) {
+ S.makeUnsplittable();
+ IsSorted = false;
+ }
+ }
+ if (!IsSorted)
+ std::sort(AS.begin(), AS.end());
+
+ /// \brief Describes the allocas introduced by rewritePartition
+ /// in order to migrate the debug info.
+ struct Piece {
+ AllocaInst *Alloca;
+ uint64_t Offset;
+ uint64_t Size;
+ Piece(AllocaInst *AI, uint64_t O, uint64_t S)
+ : Alloca(AI), Offset(O), Size(S) {}
+ };
+ SmallVector<Piece, 4> Pieces;
+
+ // Rewrite each partition.
+ for (auto &P : AS.partitions()) {
+ if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
+ Changed = true;
+ if (NewAI != &AI) {
+ uint64_t SizeOfByte = 8;
+ uint64_t AllocaSize = DL->getTypeSizeInBits(NewAI->getAllocatedType());
+ // Don't include any padding.
+ uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
+ Pieces.push_back(Piece(NewAI, P.beginOffset() * SizeOfByte, Size));
+ }
}
-
- // Otherwise, we have a tail of split slices. Rewrite them with an empty
- // range of slices.
- uint64_t PostSplitEndOffset =
- SJ == SE ? MaxSplitUseEndOffset : SJ->beginOffset();
-
- Changed |= rewritePartition(AI, AS, SJ, SJ, MaxEndOffset,
- PostSplitEndOffset, SplitUses);
++NumPartitions;
-
- if (SJ == SE)
- break; // Skip the rest, we don't need to do any cleanup.
-
- removeFinishedSplitUses(SplitUses, MaxSplitUseEndOffset,
- PostSplitEndOffset);
-
- // Now just reset the begin offset for the next iteration.
- BeginOffset = SJ->beginOffset();
}
NumAllocaPartitions += NumPartitions;
MaxPartitionsPerAlloca =
std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca);
+ // Migrate debug information from the old alloca to the new alloca(s)
+ // and the individial partitions.
+ if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(&AI)) {
+ DIVariable Var(DbgDecl->getVariable());
+ DIExpression Expr(DbgDecl->getExpression());
+ DIBuilder DIB(*AI.getParent()->getParent()->getParent(),
+ /*AllowUnresolved*/ false);
+ bool IsSplit = Pieces.size() > 1;
+ for (auto Piece : Pieces) {
+ // Create a piece expression describing the new partition or reuse AI's
+ // expression if there is only one partition.
+ DIExpression PieceExpr = Expr;
+ if (IsSplit || Expr.isBitPiece()) {
+ // If this alloca is already a scalar replacement of a larger aggregate,
+ // Piece.Offset describes the offset inside the scalar.
+ uint64_t Offset = Expr.isBitPiece() ? Expr.getBitPieceOffset() : 0;
+ uint64_t Start = Offset + Piece.Offset;
+ uint64_t Size = Piece.Size;
+ if (Expr.isBitPiece()) {
+ uint64_t AbsEnd = Expr.getBitPieceOffset() + Expr.getBitPieceSize();
+ if (Start >= AbsEnd)
+ // No need to describe a SROAed padding.
+ continue;
+ Size = std::min(Size, AbsEnd - Start);
+ }
+ PieceExpr = DIB.createBitPieceExpression(Start, Size);
+ }
+
+ // Remove any existing dbg.declare intrinsic describing the same alloca.
+ if (DbgDeclareInst *OldDDI = FindAllocaDbgDeclare(Piece.Alloca))
+ OldDDI->eraseFromParent();
+
+ auto *NewDDI = DIB.insertDeclare(Piece.Alloca, Var, PieceExpr, &AI);
+ NewDDI->setDebugLoc(DbgDecl->getDebugLoc());
+ }
+ }
return Changed;
}
@@ -3561,7 +4299,8 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
///
/// We also record the alloca instructions deleted here so that they aren't
/// subsequently handed to mem2reg to promote.
-void SROA::deleteDeadInstructions(SmallPtrSetImpl<AllocaInst*> &DeletedAllocas) {
+void SROA::deleteDeadInstructions(
+ SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
while (!DeadInsts.empty()) {
Instruction *I = DeadInsts.pop_back_val();
DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
@@ -3576,8 +4315,11 @@ void SROA::deleteDeadInstructions(SmallPtrSetImpl<AllocaInst*> &DeletedAllocas)
DeadInsts.insert(U);
}
- if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
DeletedAllocas.insert(AI);
+ if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(AI))
+ DbgDecl->eraseFromParent();
+ }
++NumDeleted;
I->eraseFromParent();
@@ -3608,14 +4350,14 @@ bool SROA::promoteAllocas(Function &F) {
if (DT && !ForceSSAUpdater) {
DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
- PromoteMemToReg(PromotableAllocas, *DT, nullptr, AT);
+ PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC);
PromotableAllocas.clear();
return true;
}
DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n");
SSAUpdater SSA;
- DIBuilder DIB(*F.getParent());
+ DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false);
SmallVector<Instruction *, 64> Insts;
// We need a worklist to walk the uses of each alloca.
@@ -3690,13 +4432,14 @@ bool SROA::runOnFunction(Function &F) {
DominatorTreeWrapperPass *DTWP =
getAnalysisIfAvailable<DominatorTreeWrapperPass>();
DT = DTWP ? &DTWP->getDomTree() : nullptr;
- AT = &getAnalysis<AssumptionTracker>();
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
BasicBlock &EntryBB = F.getEntryBlock();
for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
- I != E; ++I)
+ I != E; ++I) {
if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
Worklist.insert(AI);
+ }
bool Changed = false;
// A set of deleted alloca instruction pointers which should be removed from
@@ -3711,9 +4454,7 @@ bool SROA::runOnFunction(Function &F) {
// Remove the deleted allocas from various lists so that we don't try to
// continue processing them.
if (!DeletedAllocas.empty()) {
- auto IsInSet = [&](AllocaInst *AI) {
- return DeletedAllocas.count(AI);
- };
+ auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); };
Worklist.remove_if(IsInSet);
PostPromotionWorklist.remove_if(IsInSet);
PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(),
@@ -3734,7 +4475,7 @@ bool SROA::runOnFunction(Function &F) {
}
void SROA::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<AssumptionTracker>();
+ AU.addRequired<AssumptionCacheTracker>();
if (RequiresDomTree)
AU.addRequired<DominatorTreeWrapperPass>();
AU.setPreservesCFG();
diff --git a/lib/Transforms/Scalar/SampleProfile.cpp b/lib/Transforms/Scalar/SampleProfile.cpp
index 179bbf7..c7232a9 100644
--- a/lib/Transforms/Scalar/SampleProfile.cpp
+++ b/lib/Transforms/Scalar/SampleProfile.cpp
@@ -95,7 +95,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<LoopInfo>();
+ AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<PostDominatorTree>();
}
@@ -731,7 +731,7 @@ INITIALIZE_PASS_BEGIN(SampleProfileLoader, "sample-profile",
"Sample Profile loader", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(PostDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AddDiscriminators)
INITIALIZE_PASS_END(SampleProfileLoader, "sample-profile",
"Sample Profile loader", false, false)
@@ -762,7 +762,7 @@ bool SampleProfileLoader::runOnFunction(Function &F) {
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
PDT = &getAnalysis<PostDominatorTree>();
- LI = &getAnalysis<LoopInfo>();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
Ctx = &F.getParent()->getContext();
Samples = Reader->getSamplesFor(F);
if (!Samples->empty())
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index a16e9e2..621633b 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -20,7 +20,7 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
using namespace llvm;
@@ -28,6 +28,7 @@ using namespace llvm;
/// ScalarOpts library.
void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeADCEPass(Registry);
+ initializeBDCEPass(Registry);
initializeAlignmentFromAssumptionsPass(Registry);
initializeSampleProfileLoaderPass(Registry);
initializeConstantHoistingPass(Registry);
@@ -38,12 +39,14 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeScalarizerPass(Registry);
initializeDSEPass(Registry);
initializeGVNPass(Registry);
- initializeEarlyCSEPass(Registry);
+ initializeEarlyCSELegacyPassPass(Registry);
initializeFlattenCFGPassPass(Registry);
+ initializeInductiveRangeCheckEliminationPass(Registry);
initializeIndVarSimplifyPass(Registry);
initializeJumpThreadingPass(Registry);
initializeLICMPass(Registry);
initializeLoopDeletionPass(Registry);
+ initializeLoopAccessAnalysisPass(Registry);
initializeLoopInstSimplifyPass(Registry);
initializeLoopRotatePass(Registry);
initializeLoopStrengthReducePass(Registry);
@@ -58,6 +61,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializePartiallyInlineLibCallsPass(Registry);
initializeReassociatePass(Registry);
initializeRegToMemPass(Registry);
+ initializeRewriteStatepointsForGCPass(Registry);
initializeSCCPPass(Registry);
initializeIPSCCPPass(Registry);
initializeSROAPass(Registry);
@@ -68,7 +72,10 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeSinkingPass(Registry);
initializeTailCallElimPass(Registry);
initializeSeparateConstOffsetFromGEPPass(Registry);
+ initializeStraightLineStrengthReducePass(Registry);
initializeLoadCombinePass(Registry);
+ initializePlaceBackedgeSafepointsImplPass(Registry);
+ initializePlaceSafepointsPass(Registry);
}
void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) {
@@ -79,6 +86,10 @@ void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createAggressiveDCEPass());
}
+void LLVMAddBitTrackingDCEPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createBitTrackingDCEPass());
+}
+
void LLVMAddAlignmentFromAssumptionsPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createAlignmentFromAssumptionsPass());
}
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index f7fa917..5c49a55 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -23,7 +23,7 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CallSite.h"
@@ -198,7 +198,7 @@ namespace {
// getAnalysisUsage - This pass does not require any passes, but we know it
// will not alter the CFG, so say so.
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionTracker>();
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.setPreservesCFG();
}
@@ -216,7 +216,7 @@ namespace {
// getAnalysisUsage - This pass does not require any passes, but we know it
// will not alter the CFG, so say so.
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionTracker>();
+ AU.addRequired<AssumptionCacheTracker>();
AU.setPreservesCFG();
}
};
@@ -228,14 +228,14 @@ char SROA_SSAUp::ID = 0;
INITIALIZE_PASS_BEGIN(SROA_DT, "scalarrepl",
"Scalar Replacement of Aggregates (DT)", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(SROA_DT, "scalarrepl",
"Scalar Replacement of Aggregates (DT)", false, false)
INITIALIZE_PASS_BEGIN(SROA_SSAUp, "scalarrepl-ssa",
"Scalar Replacement of Aggregates (SSAUp)", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_END(SROA_SSAUp, "scalarrepl-ssa",
"Scalar Replacement of Aggregates (SSAUp)", false, false)
@@ -1068,12 +1068,14 @@ public:
void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) {
// Remember which alloca we're promoting (for isInstInList).
this->AI = AI;
- if (MDNode *DebugNode = MDNode::getIfExists(AI->getContext(), AI)) {
- for (User *U : DebugNode->users())
- if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
- DDIs.push_back(DDI);
- else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
- DVIs.push_back(DVI);
+ if (auto *L = LocalAsMetadata::getIfExists(AI)) {
+ if (auto *DebugNode = MetadataAsValue::getIfExists(AI->getContext(), L)) {
+ for (User *U : DebugNode->users())
+ if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
+ DDIs.push_back(DDI);
+ else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
+ DVIs.push_back(DVI);
+ }
}
LoadAndStorePromoter::run(Insts);
@@ -1417,10 +1419,11 @@ bool SROA::performPromotion(Function &F) {
DominatorTree *DT = nullptr;
if (HasDomTree)
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- AssumptionTracker *AT = &getAnalysis<AssumptionTracker>();
+ AssumptionCache &AC =
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function
- DIBuilder DIB(*F.getParent());
+ DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false);
bool Changed = false;
SmallVector<Instruction*, 64> Insts;
while (1) {
@@ -1436,7 +1439,7 @@ bool SROA::performPromotion(Function &F) {
if (Allocas.empty()) break;
if (HasDomTree)
- PromoteMemToReg(Allocas, *DT, nullptr, AT);
+ PromoteMemToReg(Allocas, *DT, nullptr, &AC);
else {
SSAUpdater SSA;
for (unsigned i = 0, e = Allocas.size(); i != e; ++i) {
diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 6157746..bffe8df 100644
--- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -313,7 +313,8 @@ class SeparateConstOffsetFromGEP : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<DataLayoutPass>();
- AU.addRequired<TargetTransformInfo>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
}
bool doInitialization(Module &M) override {
@@ -384,7 +385,7 @@ INITIALIZE_PASS_BEGIN(
SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
"Split GEPs to a variadic base and a constant offset for better CSE", false,
false)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DataLayoutPass)
INITIALIZE_PASS_END(
SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
@@ -857,7 +858,9 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
// of variable indices. Therefore, we don't check for addressing modes in that
// case.
if (!LowerGEP) {
- TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
+ TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *GEP->getParent()->getParent());
if (!TTI.isLegalAddressingMode(GEP->getType()->getElementType(),
/*BaseGV=*/nullptr, AccumulativeByteOffset,
/*HasBaseReg=*/true, /*Scale=*/0)) {
@@ -910,7 +913,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
if (LowerGEP) {
// As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to
// arithmetic operations if the target uses alias analysis in codegen.
- if (TM && TM->getSubtarget<TargetSubtargetInfo>().useAA())
+ if (TM && TM->getSubtargetImpl(*GEP->getParent()->getParent())->useAA())
lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
else
lowerToArithmetics(GEP, AccumulativeByteOffset);
@@ -996,6 +999,9 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
}
bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) {
+ if (skipOptnoneFunction(F))
+ return false;
+
if (DisableSeparateConstOffsetFromGEP)
return false;
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 046a7cb..fb8fe38 100644
--- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -21,11 +21,11 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/SimplifyCFG.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CFG.h"
@@ -37,6 +37,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Scalar.h"
using namespace llvm;
#define DEBUG_TYPE "simplifycfg"
@@ -47,36 +48,6 @@ UserBonusInstThreshold("bonus-inst-threshold", cl::Hidden, cl::init(1),
STATISTIC(NumSimpl, "Number of blocks simplified");
-namespace {
-struct CFGSimplifyPass : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- unsigned BonusInstThreshold;
- CFGSimplifyPass(int T = -1) : FunctionPass(ID) {
- BonusInstThreshold = (T == -1) ? UserBonusInstThreshold : unsigned(T);
- initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
- }
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionTracker>();
- AU.addRequired<TargetTransformInfo>();
- }
-};
-}
-
-char CFGSimplifyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
- false)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
-INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
- false)
-
-// Public interface to the CFGSimplification pass
-FunctionPass *llvm::createCFGSimplificationPass(int Threshold) {
- return new CFGSimplifyPass(Threshold);
-}
-
/// mergeEmptyReturnBlocks - If we have more than one empty (other than phi
/// node) return blocks, merge them together to promote recursive block merging.
static bool mergeEmptyReturnBlocks(Function &F) {
@@ -156,8 +127,7 @@ static bool mergeEmptyReturnBlocks(Function &F) {
/// iterativelySimplifyCFG - Call SimplifyCFG on all the blocks in the function,
/// iterating until no more changes are made.
static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
- const DataLayout *DL,
- AssumptionTracker *AT,
+ const DataLayout *DL, AssumptionCache *AC,
unsigned BonusInstThreshold) {
bool Changed = false;
bool LocalChange = true;
@@ -167,7 +137,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
// Loop over all of the basic blocks and remove them if they are unneeded...
//
for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
- if (SimplifyCFG(BBIt++, TTI, BonusInstThreshold, DL, AT)) {
+ if (SimplifyCFG(BBIt++, TTI, BonusInstThreshold, DL, AC)) {
LocalChange = true;
++NumSimpl;
}
@@ -177,20 +147,12 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
return Changed;
}
-// It is possible that we may require multiple passes over the code to fully
-// simplify the CFG.
-//
-bool CFGSimplifyPass::runOnFunction(Function &F) {
- if (skipOptnoneFunction(F))
- return false;
-
- AssumptionTracker *AT = &getAnalysis<AssumptionTracker>();
- const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
+static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
+ const DataLayout *DL, AssumptionCache *AC,
+ int BonusInstThreshold) {
bool EverChanged = removeUnreachableBlocks(F);
EverChanged |= mergeEmptyReturnBlocks(F);
- EverChanged |= iterativelySimplifyCFG(F, TTI, DL, AT, BonusInstThreshold);
+ EverChanged |= iterativelySimplifyCFG(F, TTI, DL, AC, BonusInstThreshold);
// If neither pass changed anything, we're done.
if (!EverChanged) return false;
@@ -204,9 +166,69 @@ bool CFGSimplifyPass::runOnFunction(Function &F) {
return true;
do {
- EverChanged = iterativelySimplifyCFG(F, TTI, DL, AT, BonusInstThreshold);
+ EverChanged = iterativelySimplifyCFG(F, TTI, DL, AC, BonusInstThreshold);
EverChanged |= removeUnreachableBlocks(F);
} while (EverChanged);
return true;
}
+
+SimplifyCFGPass::SimplifyCFGPass()
+ : BonusInstThreshold(UserBonusInstThreshold) {}
+
+SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold)
+ : BonusInstThreshold(BonusInstThreshold) {}
+
+PreservedAnalyses SimplifyCFGPass::run(Function &F,
+ AnalysisManager<Function> *AM) {
+ auto *DL = F.getParent()->getDataLayout();
+ auto &TTI = AM->getResult<TargetIRAnalysis>(F);
+ auto &AC = AM->getResult<AssumptionAnalysis>(F);
+
+ if (!simplifyFunctionCFG(F, TTI, DL, &AC, BonusInstThreshold))
+ return PreservedAnalyses::none();
+
+ return PreservedAnalyses::all();
+}
+
+namespace {
+struct CFGSimplifyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ unsigned BonusInstThreshold;
+ CFGSimplifyPass(int T = -1) : FunctionPass(ID) {
+ BonusInstThreshold = (T == -1) ? UserBonusInstThreshold : unsigned(T);
+ initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override {
+ if (skipOptnoneFunction(F))
+ return false;
+
+ AssumptionCache *AC =
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ const TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+ const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
+ return simplifyFunctionCFG(F, TTI, DL, AC, BonusInstThreshold);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+};
+}
+
+char CFGSimplifyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
+ false)
+
+// Public interface to the CFGSimplification pass
+FunctionPass *llvm::createCFGSimplificationPass(int Threshold) {
+ return new CFGSimplifyPass(Threshold);
+}
+
diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp
index 903b675..d0ee0a6 100644
--- a/lib/Transforms/Scalar/Sink.cpp
+++ b/lib/Transforms/Scalar/Sink.cpp
@@ -50,9 +50,9 @@ namespace {
FunctionPass::getAnalysisUsage(AU);
AU.addRequired<AliasAnalysis>();
AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfo>();
+ AU.addRequired<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<LoopInfo>();
+ AU.addPreserved<LoopInfoWrapperPass>();
}
private:
bool ProcessBlock(BasicBlock &BB);
@@ -64,7 +64,7 @@ namespace {
char Sinking::ID = 0;
INITIALIZE_PASS_BEGIN(Sinking, "sink", "Code sinking", false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_END(Sinking, "sink", "Code sinking", false, false)
@@ -98,7 +98,7 @@ bool Sinking::AllUsesDominatedByBlock(Instruction *Inst,
bool Sinking::runOnFunction(Function &F) {
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LI = &getAnalysis<LoopInfo>();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
AA = &getAnalysis<AliasAnalysis>();
DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
DL = DLP ? &DLP->getDataLayout() : nullptr;
diff --git a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
new file mode 100644
index 0000000..4edc86c
--- /dev/null
+++ b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -0,0 +1,274 @@
+//===-- StraightLineStrengthReduce.cpp - ------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements straight-line strength reduction (SLSR). Unlike loop
+// strength reduction, this algorithm is designed to reduce arithmetic
+// redundancy in straight-line code instead of loops. It has proven to be
+// effective in simplifying arithmetic statements derived from an unrolled loop.
+// It can also simplify the logic of SeparateConstOffsetFromGEP.
+//
+// There are many optimizations we can perform in the domain of SLSR. This file
+// for now contains only an initial step. Specifically, we look for strength
+// reduction candidate in the form of
+//
+// (B + i) * S
+//
+// where B and S are integer constants or variables, and i is a constant
+// integer. If we found two such candidates
+//
+// S1: X = (B + i) * S S2: Y = (B + i') * S
+//
+// and S1 dominates S2, we call S1 a basis of S2, and can replace S2 with
+//
+// Y = X + (i' - i) * S
+//
+// where (i' - i) * S is folded to the extent possible. When S2 has multiple
+// bases, we pick the one that is closest to S2, or S2's "immediate" basis.
+//
+// TODO:
+//
+// - Handle candidates in the form of B + i * S
+//
+// - Handle candidates in the form of pointer arithmetics. e.g., B[i * S]
+//
+// - Floating point arithmetics when fast math is enabled.
+//
+// - SLSR may decrease ILP at the architecture level. Targets that are very
+// sensitive to ILP may want to disable it. Having SLSR to consider ILP is
+// left as future work.
+#include <vector>
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+namespace {
+
+class StraightLineStrengthReduce : public FunctionPass {
+ public:
+ // SLSR candidate. Such a candidate must be in the form of
+ // (Base + Index) * Stride
+ struct Candidate : public ilist_node<Candidate> {
+ Candidate(Value *B = nullptr, ConstantInt *Idx = nullptr,
+ Value *S = nullptr, Instruction *I = nullptr)
+ : Base(B), Index(Idx), Stride(S), Ins(I), Basis(nullptr) {}
+ Value *Base;
+ ConstantInt *Index;
+ Value *Stride;
+ // The instruction this candidate corresponds to. It helps us to rewrite a
+ // candidate with respect to its immediate basis. Note that one instruction
+ // can corresponds to multiple candidates depending on how you associate the
+ // expression. For instance,
+ //
+ // (a + 1) * (b + 2)
+ //
+ // can be treated as
+ //
+ // <Base: a, Index: 1, Stride: b + 2>
+ //
+ // or
+ //
+ // <Base: b, Index: 2, Stride: a + 1>
+ Instruction *Ins;
+ // Points to the immediate basis of this candidate, or nullptr if we cannot
+ // find any basis for this candidate.
+ Candidate *Basis;
+ };
+
+ static char ID;
+
+ StraightLineStrengthReduce() : FunctionPass(ID), DT(nullptr) {
+ initializeStraightLineStrengthReducePass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ // We do not modify the shape of the CFG.
+ AU.setPreservesCFG();
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ private:
+ // Returns true if Basis is a basis for C, i.e., Basis dominates C and they
+ // share the same base and stride.
+ bool isBasisFor(const Candidate &Basis, const Candidate &C);
+ // Checks whether I is in a candidate form. If so, adds all the matching forms
+ // to Candidates, and tries to find the immediate basis for each of them.
+ void allocateCandidateAndFindBasis(Instruction *I);
+ // Given that I is in the form of "(B + Idx) * S", adds this form to
+ // Candidates, and finds its immediate basis.
+ void allocateCandidateAndFindBasis(Value *B, ConstantInt *Idx, Value *S,
+ Instruction *I);
+ // Rewrites candidate C with respect to Basis.
+ void rewriteCandidateWithBasis(const Candidate &C, const Candidate &Basis);
+
+ DominatorTree *DT;
+ ilist<Candidate> Candidates;
+ // Temporarily holds all instructions that are unlinked (but not deleted) by
+ // rewriteCandidateWithBasis. These instructions will be actually removed
+ // after all rewriting finishes.
+ DenseSet<Instruction *> UnlinkedInstructions;
+};
+} // anonymous namespace
+
+char StraightLineStrengthReduce::ID = 0;
+INITIALIZE_PASS_BEGIN(StraightLineStrengthReduce, "slsr",
+ "Straight line strength reduction", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(StraightLineStrengthReduce, "slsr",
+ "Straight line strength reduction", false, false)
+
+FunctionPass *llvm::createStraightLineStrengthReducePass() {
+ return new StraightLineStrengthReduce();
+}
+
+bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis,
+ const Candidate &C) {
+ return (Basis.Ins != C.Ins && // skip the same instruction
+ // Basis must dominate C in order to rewrite C with respect to Basis.
+ DT->dominates(Basis.Ins->getParent(), C.Ins->getParent()) &&
+ // They share the same base and stride.
+ Basis.Base == C.Base &&
+ Basis.Stride == C.Stride);
+}
+
+// TODO: We currently implement an algorithm whose time complexity is linear to
+// the number of existing candidates. However, a better algorithm exists. We
+// could depth-first search the dominator tree, and maintain a hash table that
+// contains all candidates that dominate the node being traversed. This hash
+// table is indexed by the base and the stride of a candidate. Therefore,
+// finding the immediate basis of a candidate boils down to one hash-table look
+// up.
+void StraightLineStrengthReduce::allocateCandidateAndFindBasis(Value *B,
+ ConstantInt *Idx,
+ Value *S,
+ Instruction *I) {
+ Candidate C(B, Idx, S, I);
+ // Try to compute the immediate basis of C.
+ unsigned NumIterations = 0;
+ // Limit the scan radius to avoid running forever.
+ static const unsigned MaxNumIterations = 50;
+ for (auto Basis = Candidates.rbegin();
+ Basis != Candidates.rend() && NumIterations < MaxNumIterations;
+ ++Basis, ++NumIterations) {
+ if (isBasisFor(*Basis, C)) {
+ C.Basis = &(*Basis);
+ break;
+ }
+ }
+ // Regardless of whether we find a basis for C, we need to push C to the
+ // candidate list.
+ Candidates.push_back(C);
+}
+
+void StraightLineStrengthReduce::allocateCandidateAndFindBasis(Instruction *I) {
+ Value *B = nullptr;
+ ConstantInt *Idx = nullptr;
+ // "(Base + Index) * Stride" must be a Mul instruction at the first hand.
+ if (I->getOpcode() == Instruction::Mul) {
+ if (IntegerType *ITy = dyn_cast<IntegerType>(I->getType())) {
+ Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+ for (unsigned Swapped = 0; Swapped < 2; ++Swapped) {
+ // Only handle the canonical operand ordering.
+ if (match(LHS, m_Add(m_Value(B), m_ConstantInt(Idx)))) {
+ // If LHS is in the form of "Base + Index", then I is in the form of
+ // "(Base + Index) * RHS".
+ allocateCandidateAndFindBasis(B, Idx, RHS, I);
+ } else {
+ // Otherwise, at least try the form (LHS + 0) * RHS.
+ allocateCandidateAndFindBasis(LHS, ConstantInt::get(ITy, 0), RHS, I);
+ }
+ // Swap LHS and RHS so that we also cover the cases where LHS is the
+ // stride.
+ if (LHS == RHS)
+ break;
+ std::swap(LHS, RHS);
+ }
+ }
+ }
+}
+
+void StraightLineStrengthReduce::rewriteCandidateWithBasis(
+ const Candidate &C, const Candidate &Basis) {
+ // An instruction can correspond to multiple candidates. Therefore, instead of
+ // simply deleting an instruction when we rewrite it, we mark its parent as
+ // nullptr (i.e. unlink it) so that we can skip the candidates whose
+ // instruction is already rewritten.
+ if (!C.Ins->getParent())
+ return;
+ assert(C.Base == Basis.Base && C.Stride == Basis.Stride);
+ // Basis = (B + i) * S
+ // C = (B + i') * S
+ // ==>
+ // C = Basis + (i' - i) * S
+ IRBuilder<> Builder(C.Ins);
+ ConstantInt *IndexOffset = ConstantInt::get(
+ C.Ins->getContext(), C.Index->getValue() - Basis.Index->getValue());
+ Value *Reduced;
+ // TODO: preserve nsw/nuw in some cases.
+ if (IndexOffset->isOne()) {
+ // If (i' - i) is 1, fold C into Basis + S.
+ Reduced = Builder.CreateAdd(Basis.Ins, C.Stride);
+ } else if (IndexOffset->isMinusOne()) {
+ // If (i' - i) is -1, fold C into Basis - S.
+ Reduced = Builder.CreateSub(Basis.Ins, C.Stride);
+ } else {
+ Value *Bump = Builder.CreateMul(C.Stride, IndexOffset);
+ Reduced = Builder.CreateAdd(Basis.Ins, Bump);
+ }
+ Reduced->takeName(C.Ins);
+ C.Ins->replaceAllUsesWith(Reduced);
+ C.Ins->dropAllReferences();
+ // Unlink C.Ins so that we can skip other candidates also corresponding to
+ // C.Ins. The actual deletion is postponed to the end of runOnFunction.
+ C.Ins->removeFromParent();
+ UnlinkedInstructions.insert(C.Ins);
+}
+
+bool StraightLineStrengthReduce::runOnFunction(Function &F) {
+ if (skipOptnoneFunction(F))
+ return false;
+
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ // Traverse the dominator tree in the depth-first order. This order makes sure
+ // all bases of a candidate are in Candidates when we process it.
+ for (auto node = GraphTraits<DominatorTree *>::nodes_begin(DT);
+ node != GraphTraits<DominatorTree *>::nodes_end(DT); ++node) {
+ BasicBlock *B = node->getBlock();
+ for (auto I = B->begin(); I != B->end(); ++I) {
+ allocateCandidateAndFindBasis(I);
+ }
+ }
+
+ // Rewrite candidates in the reverse depth-first order. This order makes sure
+ // a candidate being rewritten is not a basis for any other candidate.
+ while (!Candidates.empty()) {
+ const Candidate &C = Candidates.back();
+ if (C.Basis != nullptr) {
+ rewriteCandidateWithBasis(C, *C.Basis);
+ }
+ Candidates.pop_back();
+ }
+
+ // Delete all unlink instructions.
+ for (auto I : UnlinkedInstructions) {
+ delete I;
+ }
+ bool Ret = !UnlinkedInstructions.empty();
+ UnlinkedInstructions.clear();
+ return Ret;
+}
diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp
index b9673ed..aaf6f9a 100644
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -10,11 +10,14 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/RegionInfo.h"
#include "llvm/Analysis/RegionIterator.h"
#include "llvm/Analysis/RegionPass.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;
@@ -166,6 +169,7 @@ class StructurizeCFG : public RegionPass {
Region *ParentRegion;
DominatorTree *DT;
+ LoopInfo *LI;
RNVector Order;
BBSet Visited;
@@ -247,6 +251,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequiredID(LowerSwitchID);
AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
RegionPass::getAnalysisUsage(AU);
}
@@ -278,11 +283,65 @@ bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
/// \brief Build up the general order of nodes
void StructurizeCFG::orderNodes() {
- scc_iterator<Region *> I = scc_begin(ParentRegion);
- for (Order.clear(); !I.isAtEnd(); ++I) {
- const std::vector<RegionNode *> &Nodes = *I;
- Order.append(Nodes.begin(), Nodes.end());
+ RNVector TempOrder;
+ ReversePostOrderTraversal<Region*> RPOT(ParentRegion);
+ TempOrder.append(RPOT.begin(), RPOT.end());
+
+ std::map<Loop*, unsigned> LoopBlocks;
+
+
+ // The reverse post-order traversal of the list gives us an ordering close
+ // to what we want. The only problem with it is that sometimes backedges
+ // for outer loops will be visited before backedges for inner loops.
+ for (RegionNode *RN : TempOrder) {
+ BasicBlock *BB = RN->getEntry();
+ Loop *Loop = LI->getLoopFor(BB);
+ if (!LoopBlocks.count(Loop)) {
+ LoopBlocks[Loop] = 1;
+ continue;
+ }
+ LoopBlocks[Loop]++;
}
+
+ unsigned CurrentLoopDepth = 0;
+ Loop *CurrentLoop = nullptr;
+ BBSet TempVisited;
+ for (RNVector::iterator I = TempOrder.begin(), E = TempOrder.end(); I != E; ++I) {
+ BasicBlock *BB = (*I)->getEntry();
+ unsigned LoopDepth = LI->getLoopDepth(BB);
+
+ if (std::find(Order.begin(), Order.end(), *I) != Order.end())
+ continue;
+
+ if (LoopDepth < CurrentLoopDepth) {
+ // Make sure we have visited all blocks in this loop before moving back to
+ // the outer loop.
+
+ RNVector::iterator LoopI = I;
+ while(LoopBlocks[CurrentLoop]) {
+ LoopI++;
+ BasicBlock *LoopBB = (*LoopI)->getEntry();
+ if (LI->getLoopFor(LoopBB) == CurrentLoop) {
+ LoopBlocks[CurrentLoop]--;
+ Order.push_back(*LoopI);
+ }
+ }
+ }
+
+ CurrentLoop = LI->getLoopFor(BB);
+ if (CurrentLoop) {
+ LoopBlocks[CurrentLoop]--;
+ }
+
+ CurrentLoopDepth = LoopDepth;
+ Order.push_back(*I);
+ }
+
+ // This pass originally used a post-order traversal and then operated on
+ // the list in reverse. Now that we are using a reverse post-order traversal
+ // rather than re-working the whole pass to operate on the list in order,
+ // we just reverse the list and continue to operate on it in reverse.
+ std::reverse(Order.begin(), Order.end());
}
/// \brief Determine the end of the loops
@@ -301,8 +360,9 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
BasicBlock *Succ = Term->getSuccessor(i);
- if (Visited.count(Succ))
+ if (Visited.count(Succ)) {
Loops[Succ] = BB;
+ }
}
}
}
@@ -437,6 +497,10 @@ void StructurizeCFG::collectInfos() {
for (RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend();
OI != OE; ++OI) {
+ DEBUG(dbgs() << "Visiting: " <<
+ ((*OI)->isSubRegion() ? "SubRegion with entry: " : "") <<
+ (*OI)->getEntry()->getName() << " Loop Depth: " << LI->getLoopDepth((*OI)->getEntry()) << "\n");
+
// Analyze all the conditions leading to a node
gatherPredicates(*OI);
@@ -862,6 +926,7 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
ParentRegion = R;
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
orderNodes();
collectInfos();
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index f3c3e30..715ddeb 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -126,7 +126,7 @@ namespace {
char TailCallElim::ID = 0;
INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim",
"Tail Call Elimination", false, false)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(TailCallElim, "tailcallelim",
"Tail Call Elimination", false, false)
@@ -136,7 +136,7 @@ FunctionPass *llvm::createTailCallEliminationPass() {
}
void TailCallElim::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<TargetTransformInfo>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
}
/// \brief Scan the specified function for alloca instructions.
@@ -386,7 +386,7 @@ bool TailCallElim::runTRE(Function &F) {
// right, so don't even try to convert it...
if (F.getFunctionType()->isVarArg()) return false;
- TTI = &getAnalysis<TargetTransformInfo>();
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
BasicBlock *OldEntry = nullptr;
bool TailCallsAreMarkedTail = false;
SmallVector<PHINode*, 8> ArgumentPHIs;