Diffstat (limited to 'lib/Transforms/Scalar')
34 files changed, 6075 insertions, 2492 deletions
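Nearly every hunk in this commit applies the same two mechanical migrations: the core IR headers move from "llvm/*.h" to "llvm/IR/*.h", and TargetData is replaced by DataLayout (with getTargetData() becoming getDataLayout()). As a reading aid before the per-file hunks, here is a minimal before/after sketch of that pattern, using only spellings that appear in the hunks themselves; it is illustrative and not a compilable translation unit on its own:

    // Old spelling, as removed by the hunks below:
    #include "llvm/Target/TargetData.h"
    #include "llvm/Instructions.h"
    // ...
    const TargetData *TD = TLI ? TLI->getTargetData() : 0;

    // New spelling, as added by the hunks below:
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Instructions.h"
    // ...
    const DataLayout *TD = TLI ? TLI->getDataLayout() : 0;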
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp index b344952..a097308 100644 --- a/lib/Transforms/Scalar/ADCE.cpp +++ b/lib/Transforms/Scalar/ADCE.cpp @@ -16,16 +16,16 @@ #define DEBUG_TYPE "adce" #include "llvm/Transforms/Scalar.h" -#include "llvm/BasicBlock.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Pass.h" -#include "llvm/Support/CFG.h" -#include "llvm/Support/InstIterator.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Pass.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/InstIterator.h" using namespace llvm; STATISTIC(NumRemoved, "Number of instructions removed"); diff --git a/lib/Transforms/Scalar/BasicBlockPlacement.cpp b/lib/Transforms/Scalar/BasicBlockPlacement.cpp index cee5502..e755008 100644 --- a/lib/Transforms/Scalar/BasicBlockPlacement.cpp +++ b/lib/Transforms/Scalar/BasicBlockPlacement.cpp @@ -27,12 +27,12 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "block-placement" +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ProfileInfo.h" -#include "llvm/Function.h" +#include "llvm/IR/Function.h" #include "llvm/Pass.h" #include "llvm/Support/CFG.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Transforms/Scalar.h" #include <set> using namespace llvm; diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index a01e066..b3fc6e3 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -25,6 +25,7 @@ add_llvm_library(LLVMScalarOpts Reassociate.cpp Reg2Mem.cpp SCCP.cpp + SROA.cpp Scalar.cpp ScalarReplAggregates.cpp SimplifyCFGPass.cpp diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index 5912107..d513c96 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -15,21 +15,23 @@ #define DEBUG_TYPE "codegenprepare" #include "llvm/Transforms/Scalar.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Function.h" -#include "llvm/IRBuilder.h" -#include "llvm/InlineAsm.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Pass.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/DominatorInternals.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ProfileInfo.h" #include "llvm/Assembly/Writer.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Pass.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -37,10 +39,8 @@ #include "llvm/Support/PatternMatch.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Target/TargetLowering.h" -#include "llvm/Transforms/Utils/AddrModeMatcher.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include 
"llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/BypassSlowDivision.h" @@ -105,6 +105,8 @@ namespace { } bool runOnFunction(Function &F); + const char *getPassName() const { return "CodeGen Prepare"; } + virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<DominatorTree>(); AU.addPreserved<ProfileInfo>(); @@ -124,7 +126,7 @@ namespace { bool MoveExtToFormExtLoad(Instruction *I); bool OptimizeExtUses(Instruction *I); bool OptimizeSelectInst(SelectInst *SI); - bool DupRetToEnableTailCallOpts(ReturnInst *RI); + bool DupRetToEnableTailCallOpts(BasicBlock *BB); bool PlaceDbgValues(Function &F); }; } @@ -147,18 +149,16 @@ bool CodeGenPrepare::runOnFunction(Function &F) { TLInfo = &getAnalysis<TargetLibraryInfo>(); DT = getAnalysisIfAvailable<DominatorTree>(); PFI = getAnalysisIfAvailable<ProfileInfo>(); - OptSize = F.hasFnAttr(Attribute::OptimizeForSize); + OptSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::OptimizeForSize); /// This optimization identifies DIV instructions that can be /// profitably bypassed and carried out with a shorter, faster divide. if (TLI && TLI->isSlowDivBypassed()) { - const DenseMap<Type *, Type *> &BypassTypeMap = TLI->getBypassSlowDivTypes(); - - for (Function::iterator I = F.begin(); I != F.end(); I++) { - EverMadeChange |= bypassSlowDivision(F, - I, - BypassTypeMap); - } + const DenseMap<unsigned int, unsigned int> &BypassWidths = + TLI->getBypassSlowDivWidths(); + for (Function::iterator I = F.begin(); I != F.end(); I++) + EverMadeChange |= bypassSlowDivision(F, I, BypassWidths); } // Eliminate blocks that contain only PHI nodes and an @@ -173,7 +173,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) { bool MadeChange = true; while (MadeChange) { MadeChange = false; - for (Function::iterator I = F.begin(), E = F.end(); I != E; ) { + for (Function::iterator I = F.begin(); I != F.end(); ) { BasicBlock *BB = I++; MadeChange |= OptimizeBlock(*BB); } @@ -196,9 +196,20 @@ bool CodeGenPrepare::runOnFunction(Function &F) { WorkList.insert(*II); } - for (SmallPtrSet<BasicBlock*, 8>::iterator - I = WorkList.begin(), E = WorkList.end(); I != E; ++I) - DeleteDeadBlock(*I); + // Delete the dead blocks and any of their dead successors. + MadeChange |= !WorkList.empty(); + while (!WorkList.empty()) { + BasicBlock *BB = *WorkList.begin(); + WorkList.erase(BB); + SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB)); + + DeleteDeadBlock(BB); + + for (SmallVectorImpl<BasicBlock*>::iterator + II = Successors.begin(), IE = Successors.end(); II != IE; ++II) + if (pred_begin(*II) == pred_end(*II)) + WorkList.insert(*II); + } // Merge pairs of basic blocks with unconditional branches, connected by // a single edge. @@ -228,7 +239,8 @@ bool CodeGenPrepare::EliminateFallThrough(Function &F) { // edge, just collapse it. BasicBlock *SinglePred = BB->getSinglePredecessor(); - if (!SinglePred || SinglePred == BB) continue; + // Don't merge if BB's address is taken. + if (!SinglePred || SinglePred == BB || BB->hasAddressTaken()) continue; BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator()); if (Term && !Term->isConditional()) { @@ -623,7 +635,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { // happens. WeakVH IterHandle(CurInstIterator); - replaceAndRecursivelySimplify(CI, RetVal, TLI ? TLI->getTargetData() : 0, + replaceAndRecursivelySimplify(CI, RetVal, TLI ? TLI->getDataLayout() : 0, TLInfo, ModifiedDT ? 
0 : DT); // If the iterator instruction was recursively deleted, start over at the @@ -647,8 +659,8 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { // From here on out we're working with named functions. if (CI->getCalledFunction() == 0) return false; - // We'll need TargetData from here on out. - const TargetData *TD = TLI ? TLI->getTargetData() : 0; + // We'll need DataLayout from here on out. + const DataLayout *TD = TLI ? TLI->getDataLayout() : 0; if (!TD) return false; // Lower all default uses of _chk calls. This is very similar @@ -662,6 +674,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { /// DupRetToEnableTailCallOpts - Look for opportunities to duplicate return /// instructions to the predecessor to enable tail call optimizations. The /// case it is currently looking for is: +/// @code /// bb0: /// %tmp0 = tail call i32 @f0() /// br label %return @@ -674,9 +687,11 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { /// return: /// %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ] /// ret i32 %retval +/// @endcode /// /// => /// +/// @code /// bb0: /// %tmp0 = tail call i32 @f0() /// ret i32 %tmp0 @@ -686,11 +701,15 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { /// bb2: /// %tmp2 = tail call i32 @f2() /// ret i32 %tmp2 -/// -bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) { +/// @endcode +bool CodeGenPrepare::DupRetToEnableTailCallOpts(BasicBlock *BB) { if (!TLI) return false; + ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator()); + if (!RI) + return false; + PHINode *PN = 0; BitCastInst *BCI = 0; Value *V = RI->getReturnValue(); @@ -704,15 +723,15 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) { return false; } - BasicBlock *BB = RI->getParent(); if (PN && PN->getParent() != BB) return false; // It's not safe to eliminate the sign / zero extension of the return value. // See llvm::isInTailCallPosition(). const Function *F = BB->getParent(); - Attributes CallerRetAttr = F->getAttributes().getRetAttributes(); - if ((CallerRetAttr & Attribute::ZExt) || (CallerRetAttr & Attribute::SExt)) + Attribute CallerRetAttr = F->getAttributes().getRetAttributes(); + if (CallerRetAttr.hasAttribute(Attribute::ZExt) || + CallerRetAttr.hasAttribute(Attribute::SExt)) return false; // Make sure there are no instructions between the PHI and return, or that the @@ -769,8 +788,11 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) { // Conservatively require the attributes of the call to match those of the // return. Ignore noalias because it doesn't affect the call sequence. - Attributes CalleeRetAttr = CS.getAttributes().getRetAttributes(); - if ((CalleeRetAttr ^ CallerRetAttr) & ~Attribute::NoAlias) + Attribute CalleeRetAttr = CS.getAttributes().getRetAttributes(); + if (AttrBuilder(CalleeRetAttr). + removeAttribute(Attribute::NoAlias) != + AttrBuilder(CallerRetAttr). + removeAttribute(Attribute::NoAlias)) continue; // Make sure the call instruction is followed by an unconditional branch to @@ -787,7 +809,7 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) { } // If we eliminated all predecessors of the block, delete the block now. 
- if (Changed && pred_begin(BB) == pred_end(BB)) + if (Changed && !BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB)) BB->eraseFromParent(); return Changed; @@ -797,6 +819,629 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) { // Memory Optimization //===----------------------------------------------------------------------===// +namespace { + +/// ExtAddrMode - This is an extended version of TargetLowering::AddrMode +/// which holds actual Value*'s for register values. +struct ExtAddrMode : public TargetLowering::AddrMode { + Value *BaseReg; + Value *ScaledReg; + ExtAddrMode() : BaseReg(0), ScaledReg(0) {} + void print(raw_ostream &OS) const; + void dump() const; + + bool operator==(const ExtAddrMode& O) const { + return (BaseReg == O.BaseReg) && (ScaledReg == O.ScaledReg) && + (BaseGV == O.BaseGV) && (BaseOffs == O.BaseOffs) && + (HasBaseReg == O.HasBaseReg) && (Scale == O.Scale); + } +}; + +static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) { + AM.print(OS); + return OS; +} + +void ExtAddrMode::print(raw_ostream &OS) const { + bool NeedPlus = false; + OS << "["; + if (BaseGV) { + OS << (NeedPlus ? " + " : "") + << "GV:"; + WriteAsOperand(OS, BaseGV, /*PrintType=*/false); + NeedPlus = true; + } + + if (BaseOffs) + OS << (NeedPlus ? " + " : "") << BaseOffs, NeedPlus = true; + + if (BaseReg) { + OS << (NeedPlus ? " + " : "") + << "Base:"; + WriteAsOperand(OS, BaseReg, /*PrintType=*/false); + NeedPlus = true; + } + if (Scale) { + OS << (NeedPlus ? " + " : "") + << Scale << "*"; + WriteAsOperand(OS, ScaledReg, /*PrintType=*/false); + NeedPlus = true; + } + + OS << ']'; +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void ExtAddrMode::dump() const { + print(dbgs()); + dbgs() << '\n'; +} +#endif + + +/// \brief A helper class for matching addressing modes. +/// +/// This encapsulates the logic for matching the target-legal addressing modes. +class AddressingModeMatcher { + SmallVectorImpl<Instruction*> &AddrModeInsts; + const TargetLowering &TLI; + + /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and + /// the memory instruction that we're computing this address for. + Type *AccessTy; + Instruction *MemoryInst; + + /// AddrMode - This is the addressing mode that we're building up. This is + /// part of the return value of this addressing mode matching stuff. + ExtAddrMode &AddrMode; + + /// IgnoreProfitability - This is set to true when we should not do + /// profitability checks. When true, IsProfitableToFoldIntoAddressingMode + /// always returns true. + bool IgnoreProfitability; + + AddressingModeMatcher(SmallVectorImpl<Instruction*> &AMI, + const TargetLowering &T, Type *AT, + Instruction *MI, ExtAddrMode &AM) + : AddrModeInsts(AMI), TLI(T), AccessTy(AT), MemoryInst(MI), AddrMode(AM) { + IgnoreProfitability = false; + } +public: + + /// Match - Find the maximal addressing mode that a load/store of V can fold, + /// give an access type of AccessTy. This returns a list of involved + /// instructions in AddrModeInsts. 
+ static ExtAddrMode Match(Value *V, Type *AccessTy, + Instruction *MemoryInst, + SmallVectorImpl<Instruction*> &AddrModeInsts, + const TargetLowering &TLI) { + ExtAddrMode Result; + + bool Success = + AddressingModeMatcher(AddrModeInsts, TLI, AccessTy, + MemoryInst, Result).MatchAddr(V, 0); + (void)Success; assert(Success && "Couldn't select *anything*?"); + return Result; + } +private: + bool MatchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth); + bool MatchAddr(Value *V, unsigned Depth); + bool MatchOperationAddr(User *Operation, unsigned Opcode, unsigned Depth); + bool IsProfitableToFoldIntoAddressingMode(Instruction *I, + ExtAddrMode &AMBefore, + ExtAddrMode &AMAfter); + bool ValueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2); +}; + +/// MatchScaledValue - Try adding ScaleReg*Scale to the current addressing mode. +/// Return true and update AddrMode if this addr mode is legal for the target, +/// false if not. +bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, + unsigned Depth) { + // If Scale is 1, then this is the same as adding ScaleReg to the addressing + // mode. Just process that directly. + if (Scale == 1) + return MatchAddr(ScaleReg, Depth); + + // If the scale is 0, it takes nothing to add this. + if (Scale == 0) + return true; + + // If we already have a scale of this value, we can add to it, otherwise, we + // need an available scale field. + if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg) + return false; + + ExtAddrMode TestAddrMode = AddrMode; + + // Add scale to turn X*4+X*3 -> X*7. This could also do things like + // [A+B + A*7] -> [B+A*8]. + TestAddrMode.Scale += Scale; + TestAddrMode.ScaledReg = ScaleReg; + + // If the new address isn't legal, bail out. + if (!TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) + return false; + + // It was legal, so commit it. + AddrMode = TestAddrMode; + + // Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now + // to see if ScaleReg is actually X+C. If so, we can turn this into adding + // X*Scale + C*Scale to addr mode. + ConstantInt *CI = 0; Value *AddLHS = 0; + if (isa<Instruction>(ScaleReg) && // not a constant expr. + match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) { + TestAddrMode.ScaledReg = AddLHS; + TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale; + + // If this addressing mode is legal, commit it and remember that we folded + // this instruction. + if (TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) { + AddrModeInsts.push_back(cast<Instruction>(ScaleReg)); + AddrMode = TestAddrMode; + return true; + } + } + + // Otherwise, not (x+c)*scale, just return what we have. + return true; +} + +/// MightBeFoldableInst - This is a little filter, which returns true if an +/// addressing computation involving I might be folded into a load/store +/// accessing it. This doesn't need to be perfect, but needs to accept at least +/// the set of instructions that MatchOperationAddr can. +static bool MightBeFoldableInst(Instruction *I) { + switch (I->getOpcode()) { + case Instruction::BitCast: + // Don't touch identity bitcasts. + if (I->getType() == I->getOperand(0)->getType()) + return false; + return I->getType()->isPointerTy() || I->getType()->isIntegerTy(); + case Instruction::PtrToInt: + // PtrToInt is always a noop, as we know that the int type is pointer sized. + return true; + case Instruction::IntToPtr: + // We know the input is intptr_t, so this is foldable. 
+ return true; + case Instruction::Add: + return true; + case Instruction::Mul: + case Instruction::Shl: + // Can only handle X*C and X << C. + return isa<ConstantInt>(I->getOperand(1)); + case Instruction::GetElementPtr: + return true; + default: + return false; + } +} + +/// MatchOperationAddr - Given an instruction or constant expr, see if we can +/// fold the operation into the addressing mode. If so, update the addressing +/// mode and return true, otherwise return false without modifying AddrMode. +bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, + unsigned Depth) { + // Avoid exponential behavior on extremely deep expression trees. + if (Depth >= 5) return false; + + switch (Opcode) { + case Instruction::PtrToInt: + // PtrToInt is always a noop, as we know that the int type is pointer sized. + return MatchAddr(AddrInst->getOperand(0), Depth); + case Instruction::IntToPtr: + // This inttoptr is a no-op if the integer type is pointer sized. + if (TLI.getValueType(AddrInst->getOperand(0)->getType()) == + TLI.getPointerTy()) + return MatchAddr(AddrInst->getOperand(0), Depth); + return false; + case Instruction::BitCast: + // BitCast is always a noop, and we can handle it as long as it is + // int->int or pointer->pointer (we don't want int<->fp or something). + if ((AddrInst->getOperand(0)->getType()->isPointerTy() || + AddrInst->getOperand(0)->getType()->isIntegerTy()) && + // Don't touch identity bitcasts. These were probably put here by LSR, + // and we don't want to mess around with them. Assume it knows what it + // is doing. + AddrInst->getOperand(0)->getType() != AddrInst->getType()) + return MatchAddr(AddrInst->getOperand(0), Depth); + return false; + case Instruction::Add: { + // Check to see if we can merge in the RHS then the LHS. If so, we win. + ExtAddrMode BackupAddrMode = AddrMode; + unsigned OldSize = AddrModeInsts.size(); + if (MatchAddr(AddrInst->getOperand(1), Depth+1) && + MatchAddr(AddrInst->getOperand(0), Depth+1)) + return true; + + // Restore the old addr mode info. + AddrMode = BackupAddrMode; + AddrModeInsts.resize(OldSize); + + // Otherwise this was over-aggressive. Try merging in the LHS then the RHS. + if (MatchAddr(AddrInst->getOperand(0), Depth+1) && + MatchAddr(AddrInst->getOperand(1), Depth+1)) + return true; + + // Otherwise we definitely can't merge the ADD in. + AddrMode = BackupAddrMode; + AddrModeInsts.resize(OldSize); + break; + } + //case Instruction::Or: + // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD. + //break; + case Instruction::Mul: + case Instruction::Shl: { + // Can only handle X*C and X << C. + ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1)); + if (!RHS) return false; + int64_t Scale = RHS->getSExtValue(); + if (Opcode == Instruction::Shl) + Scale = 1LL << Scale; + + return MatchScaledValue(AddrInst->getOperand(0), Scale, Depth); + } + case Instruction::GetElementPtr: { + // Scan the GEP. We check it if it contains constant offsets and at most + // one variable offset. 
+ int VariableOperand = -1; + unsigned VariableScale = 0; + + int64_t ConstantOffset = 0; + const DataLayout *TD = TLI.getDataLayout(); + gep_type_iterator GTI = gep_type_begin(AddrInst); + for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) { + if (StructType *STy = dyn_cast<StructType>(*GTI)) { + const StructLayout *SL = TD->getStructLayout(STy); + unsigned Idx = + cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue(); + ConstantOffset += SL->getElementOffset(Idx); + } else { + uint64_t TypeSize = TD->getTypeAllocSize(GTI.getIndexedType()); + if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) { + ConstantOffset += CI->getSExtValue()*TypeSize; + } else if (TypeSize) { // Scales of zero don't do anything. + // We only allow one variable index at the moment. + if (VariableOperand != -1) + return false; + + // Remember the variable index. + VariableOperand = i; + VariableScale = TypeSize; + } + } + } + + // A common case is for the GEP to only do a constant offset. In this case, + // just add it to the disp field and check validity. + if (VariableOperand == -1) { + AddrMode.BaseOffs += ConstantOffset; + if (ConstantOffset == 0 || TLI.isLegalAddressingMode(AddrMode, AccessTy)){ + // Check to see if we can fold the base pointer in too. + if (MatchAddr(AddrInst->getOperand(0), Depth+1)) + return true; + } + AddrMode.BaseOffs -= ConstantOffset; + return false; + } + + // Save the valid addressing mode in case we can't match. + ExtAddrMode BackupAddrMode = AddrMode; + unsigned OldSize = AddrModeInsts.size(); + + // See if the scale and offset amount is valid for this target. + AddrMode.BaseOffs += ConstantOffset; + + // Match the base operand of the GEP. + if (!MatchAddr(AddrInst->getOperand(0), Depth+1)) { + // If it couldn't be matched, just stuff the value in a register. + if (AddrMode.HasBaseReg) { + AddrMode = BackupAddrMode; + AddrModeInsts.resize(OldSize); + return false; + } + AddrMode.HasBaseReg = true; + AddrMode.BaseReg = AddrInst->getOperand(0); + } + + // Match the remaining variable portion of the GEP. + if (!MatchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale, + Depth)) { + // If it couldn't be matched, try stuffing the base into a register + // instead of matching it, and retrying the match of the scale. + AddrMode = BackupAddrMode; + AddrModeInsts.resize(OldSize); + if (AddrMode.HasBaseReg) + return false; + AddrMode.HasBaseReg = true; + AddrMode.BaseReg = AddrInst->getOperand(0); + AddrMode.BaseOffs += ConstantOffset; + if (!MatchScaledValue(AddrInst->getOperand(VariableOperand), + VariableScale, Depth)) { + // If even that didn't work, bail. + AddrMode = BackupAddrMode; + AddrModeInsts.resize(OldSize); + return false; + } + } + + return true; + } + } + return false; +} + +/// MatchAddr - If we can, try to add the value of 'Addr' into the current +/// addressing mode. If Addr can't be added to AddrMode this returns false and +/// leaves AddrMode unmodified. This assumes that Addr is either a pointer type +/// or intptr_t for the target. +/// +bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { + if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) { + // Fold in immediates if legal for the target. + AddrMode.BaseOffs += CI->getSExtValue(); + if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) + return true; + AddrMode.BaseOffs -= CI->getSExtValue(); + } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) { + // If this is a global variable, try to fold it into the addressing mode. 
+ if (AddrMode.BaseGV == 0) { + AddrMode.BaseGV = GV; + if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) + return true; + AddrMode.BaseGV = 0; + } + } else if (Instruction *I = dyn_cast<Instruction>(Addr)) { + ExtAddrMode BackupAddrMode = AddrMode; + unsigned OldSize = AddrModeInsts.size(); + + // Check to see if it is possible to fold this operation. + if (MatchOperationAddr(I, I->getOpcode(), Depth)) { + // Okay, it's possible to fold this. Check to see if it is actually + // *profitable* to do so. We use a simple cost model to avoid increasing + // register pressure too much. + if (I->hasOneUse() || + IsProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) { + AddrModeInsts.push_back(I); + return true; + } + + // It isn't profitable to do this, roll back. + //cerr << "NOT FOLDING: " << *I; + AddrMode = BackupAddrMode; + AddrModeInsts.resize(OldSize); + } + } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) { + if (MatchOperationAddr(CE, CE->getOpcode(), Depth)) + return true; + } else if (isa<ConstantPointerNull>(Addr)) { + // Null pointer gets folded without affecting the addressing mode. + return true; + } + + // Worse case, the target should support [reg] addressing modes. :) + if (!AddrMode.HasBaseReg) { + AddrMode.HasBaseReg = true; + AddrMode.BaseReg = Addr; + // Still check for legality in case the target supports [imm] but not [i+r]. + if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) + return true; + AddrMode.HasBaseReg = false; + AddrMode.BaseReg = 0; + } + + // If the base register is already taken, see if we can do [r+r]. + if (AddrMode.Scale == 0) { + AddrMode.Scale = 1; + AddrMode.ScaledReg = Addr; + if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) + return true; + AddrMode.Scale = 0; + AddrMode.ScaledReg = 0; + } + // Couldn't match. + return false; +} + +/// IsOperandAMemoryOperand - Check to see if all uses of OpVal by the specified +/// inline asm call are due to memory operands. If so, return true, otherwise +/// return false. +static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, + const TargetLowering &TLI) { + TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(ImmutableCallSite(CI)); + for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { + TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; + + // Compute the constraint code and ConstraintType to use. + TLI.ComputeConstraintToUse(OpInfo, SDValue()); + + // If this asm operand is our Value*, and if it isn't an indirect memory + // operand, we can't fold it! + if (OpInfo.CallOperandVal == OpVal && + (OpInfo.ConstraintType != TargetLowering::C_Memory || + !OpInfo.isIndirect)) + return false; + } + + return true; +} + +/// FindAllMemoryUses - Recursively walk all the uses of I until we find a +/// memory use. If we find an obviously non-foldable instruction, return true. +/// Add the ultimately found memory instructions to MemoryUses. +static bool FindAllMemoryUses(Instruction *I, + SmallVectorImpl<std::pair<Instruction*,unsigned> > &MemoryUses, + SmallPtrSet<Instruction*, 16> &ConsideredInsts, + const TargetLowering &TLI) { + // If we already considered this instruction, we're done. + if (!ConsideredInsts.insert(I)) + return false; + + // If this is an obviously unfoldable instruction, bail out. + if (!MightBeFoldableInst(I)) + return true; + + // Loop over all the uses, recursively processing them. 
+ for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); + UI != E; ++UI) { + User *U = *UI; + + if (LoadInst *LI = dyn_cast<LoadInst>(U)) { + MemoryUses.push_back(std::make_pair(LI, UI.getOperandNo())); + continue; + } + + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { + unsigned opNo = UI.getOperandNo(); + if (opNo == 0) return true; // Storing addr, not into addr. + MemoryUses.push_back(std::make_pair(SI, opNo)); + continue; + } + + if (CallInst *CI = dyn_cast<CallInst>(U)) { + InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue()); + if (!IA) return true; + + // If this is a memory operand, we're cool, otherwise bail out. + if (!IsOperandAMemoryOperand(CI, IA, I, TLI)) + return true; + continue; + } + + if (FindAllMemoryUses(cast<Instruction>(U), MemoryUses, ConsideredInsts, + TLI)) + return true; + } + + return false; +} + +/// ValueAlreadyLiveAtInst - Retrn true if Val is already known to be live at +/// the use site that we're folding it into. If so, there is no cost to +/// include it in the addressing mode. KnownLive1 and KnownLive2 are two values +/// that we know are live at the instruction already. +bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1, + Value *KnownLive2) { + // If Val is either of the known-live values, we know it is live! + if (Val == 0 || Val == KnownLive1 || Val == KnownLive2) + return true; + + // All values other than instructions and arguments (e.g. constants) are live. + if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true; + + // If Val is a constant sized alloca in the entry block, it is live, this is + // true because it is just a reference to the stack/frame pointer, which is + // live for the whole function. + if (AllocaInst *AI = dyn_cast<AllocaInst>(Val)) + if (AI->isStaticAlloca()) + return true; + + // Check to see if this value is already used in the memory instruction's + // block. If so, it's already live into the block at the very least, so we + // can reasonably fold it. + return Val->isUsedInBasicBlock(MemoryInst->getParent()); +} + +/// IsProfitableToFoldIntoAddressingMode - It is possible for the addressing +/// mode of the machine to fold the specified instruction into a load or store +/// that ultimately uses it. However, the specified instruction has multiple +/// uses. Given this, it may actually increase register pressure to fold it +/// into the load. For example, consider this code: +/// +/// X = ... +/// Y = X+1 +/// use(Y) -> nonload/store +/// Z = Y+1 +/// load Z +/// +/// In this case, Y has multiple uses, and can be folded into the load of Z +/// (yielding load [X+2]). However, doing this will cause both "X" and "X+1" to +/// be live at the use(Y) line. If we don't fold Y into load Z, we use one +/// fewer register. Since Y can't be folded into "use(Y)" we don't increase the +/// number of computations either. +/// +/// Note that this (like most of CodeGenPrepare) is just a rough heuristic. If +/// X was live across 'load Z' for other reasons, we actually *would* want to +/// fold the addressing mode in the Z case. This would make Y die earlier. +bool AddressingModeMatcher:: +IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, + ExtAddrMode &AMAfter) { + if (IgnoreProfitability) return true; + + // AMBefore is the addressing mode before this instruction was folded into it, + // and AMAfter is the addressing mode after the instruction was folded. 
Get + // the set of registers referenced by AMAfter and subtract out those + // referenced by AMBefore: this is the set of values which folding in this + // address extends the lifetime of. + // + // Note that there are only two potential values being referenced here, + // BaseReg and ScaleReg (global addresses are always available, as are any + // folded immediates). + Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg; + + // If the BaseReg or ScaledReg was referenced by the previous addrmode, their + // lifetime wasn't extended by adding this instruction. + if (ValueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg)) + BaseReg = 0; + if (ValueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg)) + ScaledReg = 0; + + // If folding this instruction (and it's subexprs) didn't extend any live + // ranges, we're ok with it. + if (BaseReg == 0 && ScaledReg == 0) + return true; + + // If all uses of this instruction are ultimately load/store/inlineasm's, + // check to see if their addressing modes will include this instruction. If + // so, we can fold it into all uses, so it doesn't matter if it has multiple + // uses. + SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses; + SmallPtrSet<Instruction*, 16> ConsideredInsts; + if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI)) + return false; // Has a non-memory, non-foldable use! + + // Now that we know that all uses of this instruction are part of a chain of + // computation involving only operations that could theoretically be folded + // into a memory use, loop over each of these uses and see if they could + // *actually* fold the instruction. + SmallVector<Instruction*, 32> MatchedAddrModeInsts; + for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) { + Instruction *User = MemoryUses[i].first; + unsigned OpNo = MemoryUses[i].second; + + // Get the access type of this use. If the use isn't a pointer, we don't + // know what it accesses. + Value *Address = User->getOperand(OpNo); + if (!Address->getType()->isPointerTy()) + return false; + Type *AddressAccessTy = + cast<PointerType>(Address->getType())->getElementType(); + + // Do a match against the root of this address, ignoring profitability. This + // will tell us if the addressing mode for the memory operation will + // *actually* cover the shared instruction. + ExtAddrMode Result; + AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, AddressAccessTy, + MemoryInst, Result); + Matcher.IgnoreProfitability = true; + bool Success = Matcher.MatchAddr(Address, 0); + (void)Success; assert(Success && "Couldn't select *anything*?"); + + // If the match didn't cover I, then it won't be shared by it. + if (std::find(MatchedAddrModeInsts.begin(), MatchedAddrModeInsts.end(), + I) == MatchedAddrModeInsts.end()) + return false; + + MatchedAddrModeInsts.clear(); + } + + return true; +} + +} // end anonymous namespace + /// IsNonLocalValue - Return true if the specified values are defined in a /// different basic block than BB. 
static bool IsNonLocalValue(Value *V, BasicBlock *BB) { @@ -927,7 +1572,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " << *MemoryInst); Type *IntPtrTy = - TLI->getTargetData()->getIntPtrType(AccessTy->getContext()); + TLI->getDataLayout()->getIntPtrType(AccessTy->getContext()); Value *Result = 0; @@ -1313,9 +1958,6 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) { if (CallInst *CI = dyn_cast<CallInst>(I)) return OptimizeCallInst(CI); - if (ReturnInst *RI = dyn_cast<ReturnInst>(I)) - return DupRetToEnableTailCallOpts(RI); - if (SelectInst *SI = dyn_cast<SelectInst>(I)) return OptimizeSelectInst(SI); @@ -1330,9 +1972,11 @@ bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) { bool MadeChange = false; CurInstIterator = BB.begin(); - for (BasicBlock::iterator E = BB.end(); CurInstIterator != E; ) + while (CurInstIterator != BB.end()) MadeChange |= OptimizeInst(CurInstIterator++); + MadeChange |= DupRetToEnableTailCallOpts(&BB); + return MadeChange; } diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp index 5430f62..d5a96ec 100644 --- a/lib/Transforms/Scalar/ConstantProp.cpp +++ b/lib/Transforms/Scalar/ConstantProp.cpp @@ -20,14 +20,14 @@ #define DEBUG_TYPE "constprop" #include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Constant.h" -#include "llvm/Instruction.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instruction.h" #include "llvm/Pass.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Support/InstIterator.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/Target/TargetLibraryInfo.h" #include <set> using namespace llvm; @@ -67,7 +67,7 @@ bool ConstantPropagation::runOnFunction(Function &F) { WorkList.insert(&*i); } bool Changed = false; - TargetData *TD = getAnalysisIfAvailable<TargetData>(); + DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); while (!WorkList.empty()) { diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 9b0aadb..4c3631b 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -13,15 +13,15 @@ #define DEBUG_TYPE "correlated-value-propagation" #include "llvm/Transforms/Scalar.h" -#include "llvm/Constants.h" -#include "llvm/Function.h" -#include "llvm/Instructions.h" -#include "llvm/Pass.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyValueInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" #include "llvm/Support/CFG.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/ADT/Statistic.h" using namespace llvm; STATISTIC(NumPhis, "Number of phis propagated"); @@ -235,6 +235,11 @@ bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) { // This case never fires - remove it. CI.getCaseSuccessor()->removePredecessor(BB); SI->removeCase(CI); // Does not invalidate the iterator. + + // The condition can be modified by removePredecessor's PHI simplification + // logic. 
+ Cond = SI->getCondition(); + ++NumDeadCases; Changed = true; } else if (State == LazyValueInfo::True) { diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp index 086f0a1..e8a090a 100644 --- a/lib/Transforms/Scalar/DCE.cpp +++ b/lib/Transforms/Scalar/DCE.cpp @@ -18,12 +18,12 @@ #define DEBUG_TYPE "dce" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Instruction.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/Instruction.h" #include "llvm/Pass.h" #include "llvm/Support/InstIterator.h" #include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; STATISTIC(DIEEliminated, "Number of insts removed by DIE pass"); @@ -118,13 +118,8 @@ bool DCE::runOnFunction(Function &F) { I->eraseFromParent(); // Remove the instruction from the worklist if it still exists in it. - for (std::vector<Instruction*>::iterator WI = WorkList.begin(); - WI != WorkList.end(); ) { - if (*WI == I) - WI = WorkList.erase(WI); - else - ++WI; - } + WorkList.erase(std::remove(WorkList.begin(), WorkList.end(), I), + WorkList.end()); MadeChange = true; ++DCEEliminated; diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index 1ff4329..fe3acbf 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -17,24 +17,25 @@ #define DEBUG_TYPE "dse" #include "llvm/Transforms/Scalar.h" -#include "llvm/Constants.h" -#include "llvm/Function.h" -#include "llvm/GlobalVariable.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Pass.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Transforms/Utils/Local.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; STATISTIC(NumFastStores, "Number of stores deleted"); @@ -45,6 +46,7 @@ namespace { AliasAnalysis *AA; MemoryDependenceAnalysis *MD; DominatorTree *DT; + const TargetLibraryInfo *TLI; static char ID; // Pass identification, replacement for typeid DSE() : FunctionPass(ID), AA(0), MD(0), DT(0) { @@ -55,6 +57,7 @@ namespace { AA = &getAnalysis<AliasAnalysis>(); MD = &getAnalysis<MemoryDependenceAnalysis>(); DT = &getAnalysis<DominatorTree>(); + TLI = AA->getTargetLibraryInfo(); bool Changed = false; for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) @@ -144,7 +147,7 @@ static void DeleteDeadInstruction(Instruction *I, /// hasMemoryWrite - Does this instruction write some memory? This only returns /// true for things that we can analyze with other helpers below. 
-static bool hasMemoryWrite(Instruction *I) { +static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo *TLI) { if (isa<StoreInst>(I)) return true; if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { @@ -159,6 +162,26 @@ static bool hasMemoryWrite(Instruction *I) { return true; } } + if (CallSite CS = I) { + if (Function *F = CS.getCalledFunction()) { + if (TLI && TLI->has(LibFunc::strcpy) && + F->getName() == TLI->getName(LibFunc::strcpy)) { + return true; + } + if (TLI && TLI->has(LibFunc::strncpy) && + F->getName() == TLI->getName(LibFunc::strncpy)) { + return true; + } + if (TLI && TLI->has(LibFunc::strcat) && + F->getName() == TLI->getName(LibFunc::strcat)) { + return true; + } + if (TLI && TLI->has(LibFunc::strncat) && + F->getName() == TLI->getName(LibFunc::strncat)) { + return true; + } + } + } return false; } @@ -176,7 +199,7 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { // If we don't have target data around, an unknown size in Location means // that we should use the size of the pointee type. This isn't valid for // memset/memcpy, which writes more than an i8. - if (Loc.Size == AliasAnalysis::UnknownSize && AA.getTargetData() == 0) + if (Loc.Size == AliasAnalysis::UnknownSize && AA.getDataLayout() == 0) return AliasAnalysis::Location(); return Loc; } @@ -190,7 +213,7 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { // If we don't have target data around, an unknown size in Location means // that we should use the size of the pointee type. This isn't valid for // init.trampoline, which writes more than an i8. - if (AA.getTargetData() == 0) return AliasAnalysis::Location(); + if (AA.getDataLayout() == 0) return AliasAnalysis::Location(); // FIXME: We don't know the size of the trampoline, so we can't really // handle it here. @@ -206,7 +229,8 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { /// instruction if any. static AliasAnalysis::Location getLocForRead(Instruction *Inst, AliasAnalysis &AA) { - assert(hasMemoryWrite(Inst) && "Unknown instruction case"); + assert(hasMemoryWrite(Inst, AA.getTargetLibraryInfo()) && + "Unknown instruction case"); // The only instructions that both read and write are the mem transfer // instructions (memcpy/memmove). @@ -223,23 +247,29 @@ static bool isRemovable(Instruction *I) { if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->isUnordered(); - IntrinsicInst *II = cast<IntrinsicInst>(I); - switch (II->getIntrinsicID()) { - default: llvm_unreachable("doesn't pass 'hasMemoryWrite' predicate"); - case Intrinsic::lifetime_end: - // Never remove dead lifetime_end's, e.g. because it is followed by a - // free. - return false; - case Intrinsic::init_trampoline: - // Always safe to remove init_trampoline. - return true; + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + default: llvm_unreachable("doesn't pass 'hasMemoryWrite' predicate"); + case Intrinsic::lifetime_end: + // Never remove dead lifetime_end's, e.g. because it is followed by a + // free. + return false; + case Intrinsic::init_trampoline: + // Always safe to remove init_trampoline. + return true; - case Intrinsic::memset: - case Intrinsic::memmove: - case Intrinsic::memcpy: - // Don't remove volatile memory intrinsics. - return !cast<MemIntrinsic>(II)->isVolatile(); + case Intrinsic::memset: + case Intrinsic::memmove: + case Intrinsic::memcpy: + // Don't remove volatile memory intrinsics. 
+ return !cast<MemIntrinsic>(II)->isVolatile(); + } } + + if (CallSite CS = I) + return CS.getInstruction()->use_empty(); + + return false; } @@ -250,14 +280,19 @@ static bool isShortenable(Instruction *I) { if (isa<StoreInst>(I)) return false; - IntrinsicInst *II = cast<IntrinsicInst>(I); - switch (II->getIntrinsicID()) { - default: return false; - case Intrinsic::memset: - case Intrinsic::memcpy: - // Do shorten memory intrinsics. - return true; + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + default: return false; + case Intrinsic::memset: + case Intrinsic::memcpy: + // Do shorten memory intrinsics. + return true; + } } + + // Don't shorten libcalls calls for now. + + return false; } /// getStoredPointerOperand - Return the pointer that is being written to. @@ -267,17 +302,23 @@ static Value *getStoredPointerOperand(Instruction *I) { if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) return MI->getDest(); - IntrinsicInst *II = cast<IntrinsicInst>(I); - switch (II->getIntrinsicID()) { - default: llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::init_trampoline: - return II->getArgOperand(0); + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + default: llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::init_trampoline: + return II->getArgOperand(0); + } } + + CallSite CS = I; + // All the supported functions so far happen to have dest as their first + // argument. + return CS.getArgument(0); } static uint64_t getPointerSize(const Value *V, AliasAnalysis &AA) { uint64_t Size; - if (getObjectSize(V, Size, AA.getTargetData(), AA.getTargetLibraryInfo())) + if (getObjectSize(V, Size, AA.getDataLayout(), AA.getTargetLibraryInfo())) return Size; return AliasAnalysis::UnknownSize; } @@ -310,10 +351,10 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, // comparison. if (Later.Size == AliasAnalysis::UnknownSize || Earlier.Size == AliasAnalysis::UnknownSize) { - // If we have no TargetData information around, then the size of the store + // If we have no DataLayout information around, then the size of the store // is inferrable from the pointee type. If they are the same type, then // we know that the store is safe. - if (AA.getTargetData() == 0 && + if (AA.getDataLayout() == 0 && Later.Ptr->getType() == Earlier.Ptr->getType()) return OverwriteComplete; @@ -329,13 +370,13 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, // larger than the earlier one. if (Later.Size == AliasAnalysis::UnknownSize || Earlier.Size == AliasAnalysis::UnknownSize || - AA.getTargetData() == 0) + AA.getDataLayout() == 0) return OverwriteUnknown; // Check to see if the later store is to the entire object (either a global, // an alloca, or a byval argument). If so, then it clearly overwrites any // other store to the same object. - const TargetData &TD = *AA.getTargetData(); + const DataLayout &TD = *AA.getDataLayout(); const Value *UO1 = GetUnderlyingObject(P1, &TD), *UO2 = GetUnderlyingObject(P2, &TD); @@ -455,13 +496,13 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { Instruction *Inst = BBI++; // Handle 'free' calls specially. - if (CallInst *F = isFreeCall(Inst, AA->getTargetLibraryInfo())) { + if (CallInst *F = isFreeCall(Inst, TLI)) { MadeChange |= HandleFree(F); continue; } // If we find something that writes memory, get its memory dependence. 
- if (!hasMemoryWrite(Inst)) + if (!hasMemoryWrite(Inst, TLI)) continue; MemDepResult InstDep = MD->getDependency(Inst); @@ -484,7 +525,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { // in case we need it. WeakVH NextInst(BBI); - DeleteDeadInstruction(SI, *MD, AA->getTargetLibraryInfo()); + DeleteDeadInstruction(SI, *MD, TLI); if (NextInst == 0) // Next instruction deleted. BBI = BB.begin(); @@ -531,7 +572,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { << *DepWrite << "\n KILLER: " << *Inst << '\n'); // Delete the store and now-dead instructions that feed it. - DeleteDeadInstruction(DepWrite, *MD, AA->getTargetLibraryInfo()); + DeleteDeadInstruction(DepWrite, *MD, TLI); ++NumFastStores; MadeChange = true; @@ -628,7 +669,7 @@ bool DSE::HandleFree(CallInst *F) { MemDepResult Dep = MD->getPointerDependencyFrom(Loc, false, InstPt, BB); while (Dep.isDef() || Dep.isClobber()) { Instruction *Dependency = Dep.getInst(); - if (!hasMemoryWrite(Dependency) || !isRemovable(Dependency)) + if (!hasMemoryWrite(Dependency, TLI) || !isRemovable(Dependency)) break; Value *DepPointer = @@ -641,7 +682,7 @@ bool DSE::HandleFree(CallInst *F) { Instruction *Next = llvm::next(BasicBlock::iterator(Dependency)); // DCE instructions only used to calculate that store - DeleteDeadInstruction(Dependency, *MD, AA->getTargetLibraryInfo()); + DeleteDeadInstruction(Dependency, *MD, TLI); ++NumFastStores; MadeChange = true; @@ -660,6 +701,22 @@ bool DSE::HandleFree(CallInst *F) { return MadeChange; } +namespace { + struct CouldRef { + typedef Value *argument_type; + const CallSite CS; + AliasAnalysis *AA; + + bool operator()(Value *I) { + // See if the call site touches the value. + AliasAnalysis::ModRefResult A = + AA->getModRefInfo(CS, I, getPointerSize(I, *AA)); + + return A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref; + } + }; +} + /// handleEndBlock - Remove dead stores to stack-allocated locations in the /// function end block. Ex: /// %A = alloca i32 @@ -681,8 +738,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // Okay, so these are dead heap objects, but if the pointer never escapes // then it's leaked by this function anyways. - else if (isAllocLikeFn(I, AA->getTargetLibraryInfo()) && - !PointerMayBeCaptured(I, true, true)) + else if (isAllocLikeFn(I, TLI) && !PointerMayBeCaptured(I, true, true)) DeadStackObjects.insert(I); } @@ -698,7 +754,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { --BBI; // If we find a store, check to see if it points into a dead stack value. - if (hasMemoryWrite(BBI) && isRemovable(BBI)) { + if (hasMemoryWrite(BBI, TLI) && isRemovable(BBI)) { // See through pointer-to-pointer bitcasts SmallVector<Value *, 4> Pointers; GetUnderlyingObjects(getStoredPointerOperand(BBI), Pointers); @@ -726,8 +782,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { dbgs() << '\n'); // DCE instructions only used to calculate that store. - DeleteDeadInstruction(Dead, *MD, AA->getTargetLibraryInfo(), - &DeadStackObjects); + DeleteDeadInstruction(Dead, *MD, TLI, &DeadStackObjects); ++NumFastStores; MadeChange = true; continue; @@ -735,10 +790,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) { } // Remove any dead non-memory-mutating instructions. 
- if (isInstructionTriviallyDead(BBI, AA->getTargetLibraryInfo())) { + if (isInstructionTriviallyDead(BBI, TLI)) { Instruction *Inst = BBI++; - DeleteDeadInstruction(Inst, *MD, AA->getTargetLibraryInfo(), - &DeadStackObjects); + DeleteDeadInstruction(Inst, *MD, TLI, &DeadStackObjects); ++NumFastOther; MadeChange = true; continue; @@ -754,7 +808,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { if (CallSite CS = cast<Value>(BBI)) { // Remove allocation function calls from the list of dead stack objects; // there can't be any references before the definition. - if (isAllocLikeFn(BBI, AA->getTargetLibraryInfo())) + if (isAllocLikeFn(BBI, TLI)) DeadStackObjects.remove(BBI); // If this call does not access memory, it can't be loading any of our @@ -764,26 +818,14 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // If the call might load from any of our allocas, then any store above // the call is live. - SmallVector<Value*, 8> LiveAllocas; - for (SmallSetVector<Value*, 16>::iterator I = DeadStackObjects.begin(), - E = DeadStackObjects.end(); I != E; ++I) { - // See if the call site touches it. - AliasAnalysis::ModRefResult A = - AA->getModRefInfo(CS, *I, getPointerSize(*I, *AA)); - - if (A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref) - LiveAllocas.push_back(*I); - } + CouldRef Pred = { CS, AA }; + DeadStackObjects.remove_if(Pred); // If all of the allocas were clobbered by the call then we're not going // to find anything else to process. - if (DeadStackObjects.size() == LiveAllocas.size()) + if (DeadStackObjects.empty()) break; - for (SmallVector<Value*, 8>::iterator I = LiveAllocas.begin(), - E = LiveAllocas.end(); I != E; ++I) - DeadStackObjects.remove(*I); - continue; } @@ -820,6 +862,20 @@ bool DSE::handleEndBlock(BasicBlock &BB) { return MadeChange; } +namespace { + struct CouldAlias { + typedef Value *argument_type; + const AliasAnalysis::Location &LoadedLoc; + AliasAnalysis *AA; + + bool operator()(Value *I) { + // See if the loaded location could alias the stack location. + AliasAnalysis::Location StackLoc(I, getPointerSize(I, *AA)); + return !AA->isNoAlias(StackLoc, LoadedLoc); + } + }; +} + /// RemoveAccessedObjects - Check to see if the specified location may alias any /// of the stack objects in the DeadStackObjects set. If so, they become live /// because the location is being loaded. @@ -838,16 +894,7 @@ void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, return; } - SmallVector<Value*, 16> NowLive; - for (SmallSetVector<Value*, 16>::iterator I = DeadStackObjects.begin(), - E = DeadStackObjects.end(); I != E; ++I) { - // See if the loaded location could alias the stack location. - AliasAnalysis::Location StackLoc(*I, getPointerSize(*I, *AA)); - if (!AA->isNoAlias(StackLoc, LoadedLoc)) - NowLive.push_back(*I); - } - - for (SmallVector<Value*, 16>::iterator I = NowLive.begin(), E = NowLive.end(); - I != E; ++I) - DeadStackObjects.remove(*I); + // Remove objects that could alias LoadedLoc. 
+ CouldAlias Pred = { LoadedLoc, AA }; + DeadStackObjects.remove_if(Pred); } diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index 2627113..3c08634 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -14,17 +14,18 @@ #define DEBUG_TYPE "early-cse" #include "llvm/Transforms/Scalar.h" -#include "llvm/Instructions.h" -#include "llvm/Pass.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/ScopedHashTable.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/RecyclingAllocator.h" -#include "llvm/ADT/ScopedHashTable.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include <deque> using namespace llvm; @@ -90,35 +91,56 @@ template<> struct DenseMapInfo<SimpleValue> { unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { Instruction *Inst = Val.Inst; - // Hash in all of the operands as pointers. - unsigned Res = 0; - for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) - Res ^= getHash(Inst->getOperand(i)) << (i & 0xF); + if (BinaryOperator* BinOp = dyn_cast<BinaryOperator>(Inst)) { + Value *LHS = BinOp->getOperand(0); + Value *RHS = BinOp->getOperand(1); + if (BinOp->isCommutative() && BinOp->getOperand(0) > BinOp->getOperand(1)) + std::swap(LHS, RHS); + + if (isa<OverflowingBinaryOperator>(BinOp)) { + // Hash the overflow behavior + unsigned Overflow = + BinOp->hasNoSignedWrap() * OverflowingBinaryOperator::NoSignedWrap | + BinOp->hasNoUnsignedWrap() * OverflowingBinaryOperator::NoUnsignedWrap; + return hash_combine(BinOp->getOpcode(), Overflow, LHS, RHS); + } - if (CastInst *CI = dyn_cast<CastInst>(Inst)) - Res ^= getHash(CI->getType()); - else if (CmpInst *CI = dyn_cast<CmpInst>(Inst)) - Res ^= CI->getPredicate(); - else if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Inst)) { - for (ExtractValueInst::idx_iterator I = EVI->idx_begin(), - E = EVI->idx_end(); I != E; ++I) - Res ^= *I; - } else if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(Inst)) { - for (InsertValueInst::idx_iterator I = IVI->idx_begin(), - E = IVI->idx_end(); I != E; ++I) - Res ^= *I; - } else { - // nothing extra to hash in. 
- assert((isa<CallInst>(Inst) || - isa<BinaryOperator>(Inst) || isa<GetElementPtrInst>(Inst) || - isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) || - isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst)) && - "Invalid/unknown instruction"); + return hash_combine(BinOp->getOpcode(), LHS, RHS); } + if (CmpInst *CI = dyn_cast<CmpInst>(Inst)) { + Value *LHS = CI->getOperand(0); + Value *RHS = CI->getOperand(1); + CmpInst::Predicate Pred = CI->getPredicate(); + if (Inst->getOperand(0) > Inst->getOperand(1)) { + std::swap(LHS, RHS); + Pred = CI->getSwappedPredicate(); + } + return hash_combine(Inst->getOpcode(), Pred, LHS, RHS); + } + + if (CastInst *CI = dyn_cast<CastInst>(Inst)) + return hash_combine(CI->getOpcode(), CI->getType(), CI->getOperand(0)); + + if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Inst)) + return hash_combine(EVI->getOpcode(), EVI->getOperand(0), + hash_combine_range(EVI->idx_begin(), EVI->idx_end())); + + if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(Inst)) + return hash_combine(IVI->getOpcode(), IVI->getOperand(0), + IVI->getOperand(1), + hash_combine_range(IVI->idx_begin(), IVI->idx_end())); + + assert((isa<CallInst>(Inst) || isa<BinaryOperator>(Inst) || + isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) || + isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) || + isa<ShuffleVectorInst>(Inst)) && "Invalid/unknown instruction"); + // Mix in the opcode. - return (Res << 1) ^ Inst->getOpcode(); + return hash_combine(Inst->getOpcode(), + hash_combine_range(Inst->value_op_begin(), + Inst->value_op_end())); } bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { @@ -128,7 +150,41 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { return LHSI == RHSI; if (LHSI->getOpcode() != RHSI->getOpcode()) return false; - return LHSI->isIdenticalTo(RHSI); + if (LHSI->isIdenticalTo(RHSI)) return true; + + // If we're not strictly identical, we still might be a commutable instruction + if (BinaryOperator *LHSBinOp = dyn_cast<BinaryOperator>(LHSI)) { + if (!LHSBinOp->isCommutative()) + return false; + + assert(isa<BinaryOperator>(RHSI) + && "same opcode, but different instruction type?"); + BinaryOperator *RHSBinOp = cast<BinaryOperator>(RHSI); + + // Check overflow attributes + if (isa<OverflowingBinaryOperator>(LHSBinOp)) { + assert(isa<OverflowingBinaryOperator>(RHSBinOp) + && "same opcode, but different operator type?"); + if (LHSBinOp->hasNoUnsignedWrap() != RHSBinOp->hasNoUnsignedWrap() || + LHSBinOp->hasNoSignedWrap() != RHSBinOp->hasNoSignedWrap()) + return false; + } + + // Commuted equality + return LHSBinOp->getOperand(0) == RHSBinOp->getOperand(1) && + LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0); + } + if (CmpInst *LHSCmp = dyn_cast<CmpInst>(LHSI)) { + assert(isa<CmpInst>(RHSI) + && "same opcode, but different instruction type?"); + CmpInst *RHSCmp = cast<CmpInst>(RHSI); + // Commuted equality + return LHSCmp->getOperand(0) == RHSCmp->getOperand(1) && + LHSCmp->getOperand(1) == RHSCmp->getOperand(0) && + LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate(); + } + + return false; } //===----------------------------------------------------------------------===// @@ -216,7 +272,7 @@ namespace { /// cases. 
class EarlyCSE : public FunctionPass { public: - const TargetData *TD; + const DataLayout *TD; const TargetLibraryInfo *TLI; DominatorTree *DT; typedef RecyclingAllocator<BumpPtrAllocator, @@ -274,7 +330,8 @@ private: CallScope(*availableCalls) {} private: - NodeScope(const NodeScope&); // DO NOT IMPLEMENT + NodeScope(const NodeScope&) LLVM_DELETED_FUNCTION; + void operator=(const NodeScope&) LLVM_DELETED_FUNCTION; ScopedHTType::ScopeTy Scope; LoadHTType::ScopeTy LoadScope; @@ -313,7 +370,8 @@ private: void process() { Processed = true; } private: - StackNode(const StackNode&); // DO NOT IMPLEMENT + StackNode(const StackNode&) LLVM_DELETED_FUNCTION; + void operator=(const StackNode&) LLVM_DELETED_FUNCTION; // Members. unsigned CurrentGeneration; @@ -506,7 +564,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { bool EarlyCSE::runOnFunction(Function &F) { std::deque<StackNode *> nodesToProcess; - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); TLI = &getAnalysis<TargetLibraryInfo>(); DT = &getAnalysis<DominatorTree>(); diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 16ae6ad..14201b9 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -17,11 +17,6 @@ #define DEBUG_TYPE "gvn" #include "llvm/Transforms/Scalar.h" -#include "llvm/GlobalVariable.h" -#include "llvm/IRBuilder.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/LLVMContext.h" -#include "llvm/Metadata.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Hashing.h" @@ -37,11 +32,16 @@ #include "llvm/Analysis/PHITransAddr.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Assembly/Writer.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/PatternMatch.h" -#include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -503,7 +503,7 @@ namespace { bool NoLoads; MemoryDependenceAnalysis *MD; DominatorTree *DT; - const TargetData *TD; + const DataLayout *TD; const TargetLibraryInfo *TLI; ValueTable VN; @@ -535,7 +535,7 @@ namespace { InstrsToErase.push_back(I); } - const TargetData *getTargetData() const { return TD; } + const DataLayout *getDataLayout() const { return TD; } DominatorTree &getDominatorTree() const { return *DT; } AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); } MemoryDependenceAnalysis &getMemDep() const { return *MD; } @@ -632,7 +632,7 @@ INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false) -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void GVN::dump(DenseMap<uint32_t, Value*>& d) { errs() << "{\n"; for (DenseMap<uint32_t, Value*>::iterator I = d.begin(), @@ -730,7 +730,7 @@ SpeculationFailure: /// CoerceAvailableValueToLoadType will succeed. static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy, - const TargetData &TD) { + const DataLayout &TD) { // If the loaded or stored value is an first class array or struct, don't try // to transform them. We need to be able to bitcast to integer. 
if (LoadTy->isStructTy() || LoadTy->isArrayTy() || @@ -746,7 +746,6 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, return true; } - /// CoerceAvailableValueToLoadType - If we saw a store of a value to memory, and /// then a load from a must-aliased pointer of a different type, try to coerce /// the stored value. LoadedTy is the type of the load we want to replace and @@ -756,7 +755,7 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy, Instruction *InsertPt, - const TargetData &TD) { + const DataLayout &TD) { if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, TD)) return 0; @@ -769,24 +768,25 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, // If the store and reload are the same size, we can always reuse it. if (StoreSize == LoadSize) { // Pointer to Pointer -> use bitcast. - if (StoredValTy->isPointerTy() && LoadedTy->isPointerTy()) + if (StoredValTy->getScalarType()->isPointerTy() && + LoadedTy->getScalarType()->isPointerTy()) return new BitCastInst(StoredVal, LoadedTy, "", InsertPt); // Convert source pointers to integers, which can be bitcast. - if (StoredValTy->isPointerTy()) { - StoredValTy = TD.getIntPtrType(StoredValTy->getContext()); + if (StoredValTy->getScalarType()->isPointerTy()) { + StoredValTy = TD.getIntPtrType(StoredValTy); StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt); } Type *TypeToCastTo = LoadedTy; - if (TypeToCastTo->isPointerTy()) - TypeToCastTo = TD.getIntPtrType(StoredValTy->getContext()); + if (TypeToCastTo->getScalarType()->isPointerTy()) + TypeToCastTo = TD.getIntPtrType(TypeToCastTo); if (StoredValTy != TypeToCastTo) StoredVal = new BitCastInst(StoredVal, TypeToCastTo, "", InsertPt); // Cast to pointer if the load needs a pointer type. - if (LoadedTy->isPointerTy()) + if (LoadedTy->getScalarType()->isPointerTy()) StoredVal = new IntToPtrInst(StoredVal, LoadedTy, "", InsertPt); return StoredVal; @@ -798,8 +798,8 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, assert(StoreSize >= LoadSize && "CanCoerceMustAliasedValueToLoad fail"); // Convert source pointers to integers, which can be manipulated. - if (StoredValTy->isPointerTy()) { - StoredValTy = TD.getIntPtrType(StoredValTy->getContext()); + if (StoredValTy->getScalarType()->isPointerTy()) { + StoredValTy = TD.getIntPtrType(StoredValTy); StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt); } @@ -824,7 +824,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, return StoredVal; // If the result is a pointer, inttoptr. - if (LoadedTy->isPointerTy()) + if (LoadedTy->getScalarType()->isPointerTy()) return new IntToPtrInst(StoredVal, LoadedTy, "inttoptr", InsertPt); // Otherwise, bitcast. @@ -842,7 +842,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, Value *WritePtr, uint64_t WriteSizeInBits, - const TargetData &TD) { + const DataLayout &TD) { // If the loaded or stored value is a first class array or struct, don't try // to transform them. We need to be able to bitcast to integer. if (LoadTy->isStructTy() || LoadTy->isArrayTy()) @@ -915,7 +915,7 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, /// memdep query of a load that ends up being a clobbering store. 
static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr, StoreInst *DepSI, - const TargetData &TD) { + const DataLayout &TD) { // Cannot handle reading from store of first-class aggregate yet. if (DepSI->getValueOperand()->getType()->isStructTy() || DepSI->getValueOperand()->getType()->isArrayTy()) @@ -931,7 +931,7 @@ static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr, /// memdep query of a load that ends up being clobbered by another load. See if /// the other load can feed into the second load. static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, - LoadInst *DepLI, const TargetData &TD){ + LoadInst *DepLI, const DataLayout &TD){ // Cannot handle reading from store of first-class aggregate yet. if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy()) return -1; @@ -959,7 +959,7 @@ static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, MemIntrinsic *MI, - const TargetData &TD) { + const DataLayout &TD) { // If the mem operation is a non-constant size, we can't handle it. ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength()); if (SizeCst == 0) return -1; @@ -1009,7 +1009,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, /// before we give up. static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy, - Instruction *InsertPt, const TargetData &TD){ + Instruction *InsertPt, const DataLayout &TD){ LLVMContext &Ctx = SrcVal->getType()->getContext(); uint64_t StoreSize = (TD.getTypeSizeInBits(SrcVal->getType()) + 7) / 8; @@ -1019,8 +1019,9 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, // Compute which bits of the stored value are being used by the load. Convert // to an integer type to start with. - if (SrcVal->getType()->isPointerTy()) - SrcVal = Builder.CreatePtrToInt(SrcVal, TD.getIntPtrType(Ctx)); + if (SrcVal->getType()->getScalarType()->isPointerTy()) + SrcVal = Builder.CreatePtrToInt(SrcVal, + TD.getIntPtrType(SrcVal->getType())); if (!SrcVal->getType()->isIntegerTy()) SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8)); @@ -1048,7 +1049,7 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy, Instruction *InsertPt, GVN &gvn) { - const TargetData &TD = *gvn.getTargetData(); + const DataLayout &TD = *gvn.getDataLayout(); // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to // widen SrcVal out to a larger load. unsigned SrcValSize = TD.getTypeStoreSize(SrcVal->getType()); @@ -1107,7 +1108,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, /// memdep query of a load that ends up being a clobbering mem intrinsic. 
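To make the offset arithmetic in these helpers concrete: when a load is fully covered by an earlier, wider store, the stored bits are reused by shifting the value so the loaded bytes sit at the bottom and then truncating to the load's width. The plain-integer sketch below is illustrative only and assumes a little-endian layout; the real code also handles pointers, vectors of pointers, big-endian shift amounts, and IR-level casts.

    #include <cstdint>

    // Return the LoadSizeInBytes-wide value that a load at ByteOffset into a
    // wider stored 64-bit integer would observe on a little-endian target.
    uint64_t extractLoadedBits(uint64_t StoredVal, unsigned ByteOffset,
                               unsigned LoadSizeInBytes) {
      StoredVal >>= ByteOffset * 8;                 // drop the leading bytes
      if (LoadSizeInBytes < 8)                      // truncate to load width
        StoredVal &= (uint64_t(1) << (LoadSizeInBytes * 8)) - 1;
      return StoredVal;
    }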
static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, Type *LoadTy, Instruction *InsertPt, - const TargetData &TD){ + const DataLayout &TD){ LLVMContext &Ctx = LoadTy->getContext(); uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy)/8; @@ -1231,7 +1232,7 @@ struct AvailableValueInBlock { if (isSimpleValue()) { Res = getSimpleValue(); if (Res->getType() != LoadTy) { - const TargetData *TD = gvn.getTargetData(); + const DataLayout *TD = gvn.getDataLayout(); assert(TD && "Need target data to handle type mismatch case"); Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), *TD); @@ -1253,7 +1254,7 @@ struct AvailableValueInBlock { << *Res << '\n' << "\n\n\n"); } } else { - const TargetData *TD = gvn.getTargetData(); + const DataLayout *TD = gvn.getDataLayout(); assert(TD && "Need target data to handle type mismatch case"); Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy, BB->getTerminator(), *TD); @@ -1301,7 +1302,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, Value *V = SSAUpdate.GetValueInMiddleOfBlock(LI->getParent()); // If new PHI nodes were created, notify alias analysis. - if (V->getType()->isPointerTy()) { + if (V->getType()->getScalarType()->isPointerTy()) { AliasAnalysis *AA = gvn.getAliasAnalysis(); for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) @@ -1498,7 +1499,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { if (isa<PHINode>(V)) V->takeName(LI); - if (V->getType()->isPointerTy()) + if (V->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(V); markInstructionForDeletion(LI); ++NumGVNLoad; @@ -1730,7 +1731,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { LI->replaceAllUsesWith(V); if (isa<PHINode>(V)) V->takeName(LI); - if (V->getType()->isPointerTy()) + if (V->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(V); markInstructionForDeletion(LI); ++NumPRELoad; @@ -1857,7 +1858,7 @@ bool GVN::processLoad(LoadInst *L) { // Replace the load! L->replaceAllUsesWith(AvailVal); - if (AvailVal->getType()->isPointerTy()) + if (AvailVal->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(AvailVal); markInstructionForDeletion(L); ++NumGVNLoad; @@ -1914,7 +1915,7 @@ bool GVN::processLoad(LoadInst *L) { // Remove it! L->replaceAllUsesWith(StoredVal); - if (StoredVal->getType()->isPointerTy()) + if (StoredVal->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(StoredVal); markInstructionForDeletion(L); ++NumGVNLoad; @@ -1943,7 +1944,7 @@ bool GVN::processLoad(LoadInst *L) { // Remove it! patchAndReplaceAllUsesWith(AvailableVal, L); - if (DepLI->getType()->isPointerTy()) + if (DepLI->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(DepLI); markInstructionForDeletion(L); ++NumGVNLoad; @@ -2184,7 +2185,7 @@ bool GVN::processInstruction(Instruction *I) { // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify. if (Value *V = SimplifyInstruction(I, TD, TLI, DT)) { I->replaceAllUsesWith(V); - if (MD && V->getType()->isPointerTy()) + if (MD && V->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(V); markInstructionForDeletion(I); ++NumGVNSimpl; @@ -2284,7 +2285,7 @@ bool GVN::processInstruction(Instruction *I) { // Remove it! 
patchAndReplaceAllUsesWith(repl, I); - if (MD && repl->getType()->isPointerTy()) + if (MD && repl->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(repl); markInstructionForDeletion(I); return true; @@ -2295,7 +2296,7 @@ bool GVN::runOnFunction(Function& F) { if (!NoLoads) MD = &getAnalysis<MemoryDependenceAnalysis>(); DT = &getAnalysis<DominatorTree>(); - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); TLI = &getAnalysis<TargetLibraryInfo>(); VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>()); VN.setMemDep(MD); @@ -2532,7 +2533,7 @@ bool GVN::performPRE(Function &F) { addToLeaderTable(ValNo, Phi, CurrentBlock); Phi->setDebugLoc(CurInst->getDebugLoc()); CurInst->replaceAllUsesWith(Phi); - if (Phi->getType()->isPointerTy()) { + if (Phi->getType()->getScalarType()->isPointerTy()) { // Because we have added a PHI-use of the pointer value, it has now // "escaped" from alias analysis' perspective. We need to inform // AA of this. diff --git a/lib/Transforms/Scalar/GlobalMerge.cpp b/lib/Transforms/Scalar/GlobalMerge.cpp index b36a3cb..1601a8d 100644 --- a/lib/Transforms/Scalar/GlobalMerge.cpp +++ b/lib/Transforms/Scalar/GlobalMerge.cpp @@ -53,19 +53,19 @@ #define DEBUG_TYPE "global-merge" #include "llvm/Transforms/Scalar.h" -#include "llvm/Attributes.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Function.h" -#include "llvm/GlobalVariable.h" -#include "llvm/Instructions.h" -#include "llvm/Intrinsics.h" -#include "llvm/Module.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" #include "llvm/Pass.h" -#include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" -#include "llvm/ADT/Statistic.h" using namespace llvm; STATISTIC(NumMerged , "Number of globals merged"); @@ -76,7 +76,7 @@ namespace { const TargetLowering *TLI; bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals, - Module &M, bool isConst) const; + Module &M, bool isConst, unsigned AddrSpace) const; public: static char ID; // Pass identification, replacement for typeid. 
@@ -98,9 +98,9 @@ namespace { } struct GlobalCmp { - const TargetData *TD; + const DataLayout *TD; - GlobalCmp(const TargetData *td) : TD(td) { } + GlobalCmp(const DataLayout *td) : TD(td) { } bool operator()(const GlobalVariable *GV1, const GlobalVariable *GV2) { Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType(); @@ -118,8 +118,8 @@ INITIALIZE_PASS(GlobalMerge, "global-merge", bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, - Module &M, bool isConst) const { - const TargetData *TD = TLI->getTargetData(); + Module &M, bool isConst, unsigned AddrSpace) const { + const DataLayout *TD = TLI->getDataLayout(); // FIXME: Infer the maximum possible offset depending on the actual users // (these max offsets are different for the users inside Thumb or ARM @@ -150,7 +150,9 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, Constant *MergedInit = ConstantStruct::get(MergedTy, Inits); GlobalVariable *MergedGV = new GlobalVariable(M, MergedTy, isConst, GlobalValue::InternalLinkage, - MergedInit, "_MergedGlobals"); + MergedInit, "_MergedGlobals", + 0, GlobalVariable::NotThreadLocal, + AddrSpace); for (size_t k = i; k < j; ++k) { Constant *Idx[2] = { ConstantInt::get(Int32Ty, 0), @@ -169,8 +171,9 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, bool GlobalMerge::doInitialization(Module &M) { - SmallVector<GlobalVariable*, 16> Globals, ConstGlobals, BSSGlobals; - const TargetData *TD = TLI->getTargetData(); + DenseMap<unsigned, SmallVector<GlobalVariable*, 16> > Globals, ConstGlobals, + BSSGlobals; + const DataLayout *TD = TLI->getDataLayout(); unsigned MaxOffset = TLI->getMaximalGlobalOffset(); bool Changed = false; @@ -181,6 +184,11 @@ bool GlobalMerge::doInitialization(Module &M) { if (!I->hasLocalLinkage() || I->isThreadLocal() || I->hasSection()) continue; + PointerType *PT = dyn_cast<PointerType>(I->getType()); + assert(PT && "Global variable is not a pointer!"); + + unsigned AddressSpace = PT->getAddressSpace(); + // Ignore fancy-aligned globals for now. unsigned Alignment = TD->getPreferredAlignment(I); Type *Ty = I->getType()->getElementType(); @@ -195,18 +203,23 @@ bool GlobalMerge::doInitialization(Module &M) { if (TD->getTypeAllocSize(Ty) < MaxOffset) { if (TargetLoweringObjectFile::getKindForGlobal(I, TLI->getTargetMachine()) .isBSSLocal()) - BSSGlobals.push_back(I); + BSSGlobals[AddressSpace].push_back(I); else if (I->isConstant()) - ConstGlobals.push_back(I); + ConstGlobals[AddressSpace].push_back(I); else - Globals.push_back(I); + Globals[AddressSpace].push_back(I); } } - if (Globals.size() > 1) - Changed |= doMerge(Globals, M, false); - if (BSSGlobals.size() > 1) - Changed |= doMerge(BSSGlobals, M, false); + for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator + I = Globals.begin(), E = Globals.end(); I != E; ++I) + if (I->second.size() > 1) + Changed |= doMerge(I->second, M, false, I->first); + + for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator + I = BSSGlobals.begin(), E = BSSGlobals.end(); I != E; ++I) + if (I->second.size() > 1) + Changed |= doMerge(I->second, M, false, I->first); // FIXME: This currently breaks the EH processing due to way how the // typeinfo detection works. 
We might want to detect the TIs and ignore diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index c933a17..97fff7e 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -26,28 +26,28 @@ #define DEBUG_TYPE "indvars" #include "llvm/Transforms/Scalar.h" -#include "llvm/BasicBlock.h" -#include "llvm/Constants.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/LLVMContext.h" -#include "llvm/Type.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Dominators.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Type.h" #include "llvm/Support/CFG.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" using namespace llvm; STATISTIC(NumWidened , "Number of indvars widened"); @@ -68,7 +68,7 @@ namespace { LoopInfo *LI; ScalarEvolution *SE; DominatorTree *DT; - TargetData *TD; + DataLayout *TD; TargetLibraryInfo *TLI; SmallVector<WeakVH, 16> DeadInsts; @@ -220,8 +220,6 @@ static Instruction *getInsertPointForUses(Instruction *User, Value *Def, /// ConvertToSInt - Convert APF to an integer, if possible. static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) { bool isExact = false; - if (&APF.getSemantics() == &APFloat::PPCDoubleDouble) - return false; // See if we can convert this to an int64_t uint64_t UIntVal; if (APF.convertToInteger(&UIntVal, 64, true, APFloat::rmTowardZero, @@ -551,15 +549,17 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { PN->setIncomingValue(i, ExitVal); - // If this instruction is dead now, delete it. - RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI); + // If this instruction is dead now, delete it. Don't do it now to avoid + // invalidating iterators. + if (isInstructionTriviallyDead(Inst, TLI)) + DeadInsts.push_back(Inst); if (NumPreds == 1) { // Completely replace a single-pred PHI. This is safe, because the // NewVal won't be variant in the loop, so we don't need an LCSSA phi // node anymore. PN->replaceAllUsesWith(ExitVal); - RecursivelyDeleteTriviallyDeadInstructions(PN, TLI); + PN->eraseFromParent(); } } if (NumPreds != 1) { @@ -597,13 +597,13 @@ namespace { class WideIVVisitor : public IVVisitor { ScalarEvolution *SE; - const TargetData *TD; + const DataLayout *TD; public: WideIVInfo WI; WideIVVisitor(PHINode *NarrowIV, ScalarEvolution *SCEV, - const TargetData *TData) : + const DataLayout *TData) : SE(SCEV), TD(TData) { WI.NarrowIV = NarrowIV; } // Implement the interface used by simplifyUsersOfIV. 
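The RewriteLoopExitValues change above replaces an immediate recursive deletion with a deferred DeadInsts worklist for a mundane reason: erasing instructions while the surrounding containers are still being walked invalidates the iterators doing the walking. The same pattern in stand-alone C++, with stand-in types rather than the pass's own, looks roughly like this.

    #include <string>
    #include <vector>

    struct Node { std::string Name; bool Dead = false; };

    // Collect candidates during the walk and mutate only afterwards; erasing
    // from the vector inside the first loop would invalidate its iterator.
    void sweepDeferred(std::vector<Node> &Nodes) {
      std::vector<std::string> ToErase;
      for (const Node &N : Nodes)
        if (N.Dead)
          ToErase.push_back(N.Name);        // defer, do not erase here

      for (const std::string &Name : ToErase)
        for (auto It = Nodes.begin(); It != Nodes.end(); ++It)
          if (It->Name == Name) { Nodes.erase(It); break; }
    }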
@@ -1261,8 +1261,13 @@ static bool needsLFTR(Loop *L, DominatorTree *DT) { if (!Phi) return true; + // Do LFTR if PHI node is defined in the loop, but is *not* a counter. + int Idx = Phi->getBasicBlockIndex(L->getLoopLatch()); + if (Idx < 0) + return true; + // Do LFTR if the exit condition's IV is *not* a simple counter. - Value *IncV = Phi->getIncomingValueForBlock(L->getLoopLatch()); + Value *IncV = Phi->getIncomingValue(Idx); return Phi != getLoopPhiForCounter(IncV, L, DT); } @@ -1341,7 +1346,7 @@ static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) { /// could at least handle constant BECounts. static PHINode * FindLoopCounter(Loop *L, const SCEV *BECount, - ScalarEvolution *SE, DominatorTree *DT, const TargetData *TD) { + ScalarEvolution *SE, DominatorTree *DT, const DataLayout *TD) { uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType()); Value *Cond = @@ -1698,7 +1703,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { LI = &getAnalysis<LoopInfo>(); SE = &getAnalysis<ScalarEvolution>(); DT = &getAnalysis<DominatorTree>(); - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); DeadInsts.clear(); diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index 20844c6..b61c5ba 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -13,28 +13,28 @@ #define DEBUG_TYPE "jump-threading" #include "llvm/Transforms/Scalar.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/LLVMContext.h" -#include "llvm/Pass.h" -#include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/LazyValueInfo.h" -#include "llvm/Analysis/Loads.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/Statistic.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LazyValueInfo.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; STATISTIC(NumThreads, "Number of jumps threaded"); @@ -75,7 +75,7 @@ namespace { /// revectored to the false side of the second if. 
/// class JumpThreading : public FunctionPass { - TargetData *TD; + DataLayout *TD; TargetLibraryInfo *TLI; LazyValueInfo *LVI; #ifdef NDEBUG @@ -147,7 +147,7 @@ FunctionPass *llvm::createJumpThreadingPass() { return new JumpThreading(); } /// bool JumpThreading::runOnFunction(Function &F) { DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n"); - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); TLI = &getAnalysis<TargetLibraryInfo>(); LVI = &getAnalysis<LazyValueInfo>(); @@ -216,19 +216,24 @@ bool JumpThreading::runOnFunction(Function &F) { } /// getJumpThreadDuplicationCost - Return the cost of duplicating this block to -/// thread across it. -static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB) { +/// thread across it. Stop scanning the block when passing the threshold. +static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB, + unsigned Threshold) { /// Ignore PHI nodes, these will be flattened when duplication happens. BasicBlock::const_iterator I = BB->getFirstNonPHI(); // FIXME: THREADING will delete values that are just used to compute the // branch, so they shouldn't count against the duplication cost. - // Sum up the cost of each instruction until we get to the terminator. Don't // include the terminator because the copy won't include it. unsigned Size = 0; for (; !isa<TerminatorInst>(I); ++I) { + + // Stop scanning the block if we've reached the threshold. + if (Size > Threshold) + return Size; + // Debugger intrinsics don't incur code size. if (isa<DbgInfoIntrinsic>(I)) continue; @@ -244,7 +249,11 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB) { // as having cost of 2 total, and if they are a vector intrinsic, we model // them as having cost 1. if (const CallInst *CI = dyn_cast<CallInst>(I)) { - if (!isa<IntrinsicInst>(CI)) + if (CI->hasFnAttr(Attribute::NoDuplicate)) + // Blocks with NoDuplicate are modelled as having infinite cost, so they + // are never duplicated. 
+ return ~0U; + else if (!isa<IntrinsicInst>(CI)) Size += 3; else if (!CI->getType()->isVectorTy()) Size += 1; @@ -1337,7 +1346,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, return false; } - unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB); + unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB, Threshold); if (JumpThreadCost > Threshold) { DEBUG(dbgs() << " Not threading BB '" << BB->getName() << "' - Cost is too high: " << JumpThreadCost << "\n"); @@ -1481,7 +1490,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, return false; } - unsigned DuplicationCost = getJumpThreadDuplicationCost(BB); + unsigned DuplicationCost = getJumpThreadDuplicationCost(BB, Threshold); if (DuplicationCost > Threshold) { DEBUG(dbgs() << " Not duplicating BB '" << BB->getName() << "' - Cost is too high: " << DuplicationCost << "\n"); diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 99bedce..dc6bef7 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -32,27 +32,28 @@ #define DEBUG_TYPE "licm" #include "llvm/Transforms/Scalar.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Instructions.h" -#include "llvm/LLVMContext.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" #include "llvm/Support/CFG.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Support/Debug.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" #include <algorithm> using namespace llvm; @@ -90,6 +91,8 @@ namespace { AU.addRequired<TargetLibraryInfo>(); } + using llvm::Pass::doFinalization; + bool doFinalization() { assert(LoopToAliasSetMap.empty() && "Didn't free loop alias sets"); return false; @@ -100,7 +103,7 @@ namespace { LoopInfo *LI; // Current LoopInfo DominatorTree *DT; // Dominator Tree for the current Loop. - TargetData *TD; // TargetData for constant folding. + DataLayout *TD; // DataLayout for constant folding. TargetLibraryInfo *TLI; // TargetLibraryInfo for constant folding. // State that is updated as we process loops. 
@@ -207,7 +210,7 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { AA = &getAnalysis<AliasAnalysis>(); DT = &getAnalysis<DominatorTree>(); - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); TLI = &getAnalysis<TargetLibraryInfo>(); CurAST = new AliasSetTracker(*AA); @@ -663,16 +666,18 @@ namespace { AliasSetTracker &AST; DebugLoc DL; int Alignment; + MDNode *TBAATag; public: LoopPromoter(Value *SP, const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, SmallPtrSet<Value*, 4> &PMA, SmallVectorImpl<BasicBlock*> &LEB, SmallVectorImpl<Instruction*> &LIP, - AliasSetTracker &ast, DebugLoc dl, int alignment) + AliasSetTracker &ast, DebugLoc dl, int alignment, + MDNode *TBAATag) : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA), LoopExitBlocks(LEB), LoopInsertPts(LIP), - AST(ast), DL(dl), Alignment(alignment) {} + AST(ast), DL(dl), Alignment(alignment), TBAATag(TBAATag) {} virtual bool isInstInList(Instruction *I, const SmallVectorImpl<Instruction*> &) const { @@ -696,6 +701,7 @@ namespace { StoreInst *NewSI = new StoreInst(LiveInValue, SomePtr, InsertPos); NewSI->setAlignment(Alignment); NewSI->setDebugLoc(DL); + if (TBAATag) NewSI->setMetadata(LLVMContext::MD_tbaa, TBAATag); } } @@ -749,10 +755,11 @@ void LICM::PromoteAliasSet(AliasSet &AS, // We start with an alignment of one and try to find instructions that allow // us to prove better alignment. unsigned Alignment = 1; + MDNode *TBAATag = 0; // Check that all of the pointers in the alias set have the same type. We // cannot (yet) promote a memory location that is loaded and stored in - // different sizes. + // different sizes. While we are at it, collect alignment and TBAA info. for (AliasSet::iterator ASI = AS.begin(), E = AS.end(); ASI != E; ++ASI) { Value *ASIV = ASI->getValue(); PointerMustAliases.insert(ASIV); @@ -794,8 +801,7 @@ void LICM::PromoteAliasSet(AliasSet &AS, // instruction will be executed, update the alignment. // Larger is better, with the exception of 0 being the best alignment. unsigned InstAlignment = store->getAlignment(); - if ((InstAlignment > Alignment || InstAlignment == 0) - && (Alignment != 0)) + if ((InstAlignment > Alignment || InstAlignment == 0) && Alignment != 0) if (isGuaranteedToExecute(*Use)) { GuaranteedToExecute = true; Alignment = InstAlignment; @@ -807,6 +813,15 @@ void LICM::PromoteAliasSet(AliasSet &AS, } else return; // Not a load or store. + // Merge the TBAA tags. + if (LoopUses.empty()) { + // On the first load/store, just take its TBAA tag. + TBAATag = Use->getMetadata(LLVMContext::MD_tbaa); + } else if (TBAATag) { + TBAATag = MDNode::getMostGenericTBAA(TBAATag, + Use->getMetadata(LLVMContext::MD_tbaa)); + } + LoopUses.push_back(Use); } } @@ -839,7 +854,7 @@ void LICM::PromoteAliasSet(AliasSet &AS, SmallVector<PHINode*, 16> NewPHIs; SSAUpdater SSA(&NewPHIs); LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks, - InsertPts, *CurAST, DL, Alignment); + InsertPts, *CurAST, DL, Alignment, TBAATag); // Set up the preheader to have a definition of the value. It is the live-out // value from the preheader that uses in the loop will use. 
@@ -848,6 +863,7 @@ void LICM::PromoteAliasSet(AliasSet &AS,
                                   Preheader->getTerminator());
   PreheaderLoad->setAlignment(Alignment);
   PreheaderLoad->setDebugLoc(DL);
+  if (TBAATag) PreheaderLoad->setMetadata(LLVMContext::MD_tbaa, TBAATag);
   SSA.AddAvailableValue(Preheader, PreheaderLoad);
 
   // Rewrite all the loads in the loop and remember all the definitions from
diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp
index 3771f5a..9c67e32 100644
--- a/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -16,11 +16,11 @@
 #define DEBUG_TYPE "loop-delete"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Analysis/LoopPass.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/SmallVector.h"
 using namespace llvm;
 
 STATISTIC(NumDeleted, "Number of loops deleted");
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index a72e288..c4f9012 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -43,18 +43,19 @@
 #define DEBUG_TYPE "loop-idiom"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Module.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetData.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
@@ -63,16 +64,83 @@ STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
 STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
 
 namespace {
+
+  class LoopIdiomRecognize;
+
+  /// This class defines some utility functions for loop idiom recognition.
+  class LIRUtil {
+  public:
+    /// Return true iff the block contains nothing but an unconditional branch
+    /// (aka goto instruction).
+    static bool isAlmostEmpty(BasicBlock *);
+
+    static BranchInst *getBranch(BasicBlock *BB) {
+      return dyn_cast<BranchInst>(BB->getTerminator());
+    }
+
+    /// Return the condition of the branch terminating the given basic block.
+    static Value *getBrCondtion(BasicBlock *);
+
+    /// Derive the precondition block (i.e. the block that guards the loop
+    /// preheader) from the given preheader.
+    static BasicBlock *getPrecondBb(BasicBlock *PreHead);
+  };
+
+  /// This class recognizes idioms of population count conducted in
+  /// a noncountable loop. Currently it only recognizes this pattern:
+  /// \code
+  ///   while(x) {cnt++; ...; x &= x - 1; ...}
+  /// \endcode
+  class NclPopcountRecognize {
+    LoopIdiomRecognize &LIR;
+    Loop *CurLoop;
+    BasicBlock *PreCondBB;
+
+    typedef IRBuilder<> IRBuilderTy;
+
+  public:
+    explicit NclPopcountRecognize(LoopIdiomRecognize &TheLIR);
+    bool recognize();
+
+  private:
+    /// Take a glimpse of the loop to see if we need to go ahead recognizing
+    /// the idiom.
+    bool preliminaryScreen();
+
+    /// Check if the given conditional branch is based on the comparison
+    /// between a variable and zero, and if the variable is non-zero, the
+    /// control yields to the loop entry. If the branch matches the behavior,
+    /// the variable involved in the comparison is returned. This function will
+    /// be called to see if the precondition and postcondition of the loop
+    /// are in desirable form.
+    Value *matchCondition (BranchInst *Br, BasicBlock *NonZeroTarget) const;
+
+    /// Return true iff the idiom is detected in the loop, and 1) \p CntInst
+    /// is set to the instruction counting the population bits. 2) \p CntPhi
+    /// is set to the corresponding phi node. 3) \p Var is set to the value
+    /// whose population bits are being counted.
+    bool detectIdiom
+      (Instruction *&CntInst, PHINode *&CntPhi, Value *&Var) const;
+
+    /// Insert the ctpop intrinsic function and some obviously dead instructions.
+    void transform (Instruction *CntInst, PHINode *CntPhi, Value *Var);
+
+    /// Create llvm.ctpop.* intrinsic function.
+    CallInst *createPopcntIntrinsic(IRBuilderTy &IRB, Value *Val, DebugLoc DL);
+  };
+
   class LoopIdiomRecognize : public LoopPass {
     Loop *CurLoop;
-    const TargetData *TD;
+    const DataLayout *TD;
     DominatorTree *DT;
     ScalarEvolution *SE;
     TargetLibraryInfo *TLI;
+    const TargetTransformInfo *TTI;
   public:
     static char ID;
     explicit LoopIdiomRecognize() : LoopPass(ID) {
       initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry());
+      TD = 0; DT = 0; SE = 0; TLI = 0; TTI = 0;
     }
 
     bool runOnLoop(Loop *L, LPPassManager &LPM);
@@ -109,7 +177,34 @@ namespace {
       AU.addPreserved<DominatorTree>();
       AU.addRequired<DominatorTree>();
       AU.addRequired<TargetLibraryInfo>();
+      AU.addRequired<TargetTransformInfo>();
+    }
+
+    const DataLayout *getDataLayout() {
+      return TD ? TD : TD=getAnalysisIfAvailable<DataLayout>();
+    }
+
+    DominatorTree *getDominatorTree() {
+      return DT ? DT : (DT=&getAnalysis<DominatorTree>());
+    }
+
+    ScalarEvolution *getScalarEvolution() {
+      return SE ? SE : (SE = &getAnalysis<ScalarEvolution>());
+    }
+
+    TargetLibraryInfo *getTargetLibraryInfo() {
+      return TLI ? TLI : (TLI = &getAnalysis<TargetLibraryInfo>());
+    }
+
+    const TargetTransformInfo *getTargetTransformInfo() {
+      return TTI ? TTI : (TTI = &getAnalysis<TargetTransformInfo>());
     }
+
+    Loop *getLoop() const { return CurLoop; }
+
+  private:
+    bool runOnNoncountableLoop();
+    bool runOnCountableLoop();
   };
 }
 
@@ -123,6 +218,7 @@ INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
 INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms",
                     false, false)
 
@@ -172,19 +268,393 @@ static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE,
   deleteDeadInstruction(I, SE, TLI);
 }
 
-bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
-  CurLoop = L;
+//===----------------------------------------------------------------------===//
+//
+//          Implementation of LIRUtil
+//
+//===----------------------------------------------------------------------===//
 
-  // Disable loop idiom recognition if the function's name is a common idiom.
-  StringRef Name = L->getHeader()->getParent()->getName();
-  if (Name == "memset" || Name == "memcpy")
+// This function will return true iff the given block contains nothing but goto.
+// A typical usage of this function is to check if the preheader is
+// "almost" empty, so that the generated intrinsic function can be moved across
+// the preheader and placed at the end of the precondition block without
+// worrying about breaking data dependences.
+bool LIRUtil::isAlmostEmpty(BasicBlock *BB) {
+  if (BranchInst *Br = getBranch(BB)) {
+    return Br->isUnconditional() && BB->size() == 1;
+  }
+  return false;
+}
+
+Value *LIRUtil::getBrCondtion(BasicBlock *BB) {
+  BranchInst *Br = getBranch(BB);
+  return Br ? Br->getCondition() : 0;
+}
+
+BasicBlock *LIRUtil::getPrecondBb(BasicBlock *PreHead) {
+  if (BasicBlock *BB = PreHead->getSinglePredecessor()) {
+    BranchInst *Br = getBranch(BB);
+    return Br && Br->isConditional() ? BB : 0;
+  }
+  return 0;
+}
+
+//===----------------------------------------------------------------------===//
+//
+//          Implementation of NclPopcountRecognize
+//
+//===----------------------------------------------------------------------===//
+
+NclPopcountRecognize::NclPopcountRecognize(LoopIdiomRecognize &TheLIR):
+  LIR(TheLIR), CurLoop(TheLIR.getLoop()), PreCondBB(0) {
+}
+
+bool NclPopcountRecognize::preliminaryScreen() {
+  const TargetTransformInfo *TTI = LIR.getTargetTransformInfo();
+  if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
     return false;
 
-  // The trip count of the loop must be analyzable.
-  SE = &getAnalysis<ScalarEvolution>();
-  if (!SE->hasLoopInvariantBackedgeTakenCount(L))
+  // Counting the population is usually done with a few arithmetic
+  // instructions. Such instructions can be easily "absorbed" by vacant slots
+  // in a non-compact loop. Therefore, recognizing the popcount idiom only
+  // makes sense in a compact loop.
+
+  // Give up if the loop has multiple blocks or multiple backedges.
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+    return false;
+
+  BasicBlock *LoopBody = *(CurLoop->block_begin());
+  if (LoopBody->size() >= 20) {
+    // The loop is too big, bail out.
+    return false;
+  }
+
+  // It should have a preheader containing nothing but a goto instruction.
+  BasicBlock *PreHead = CurLoop->getLoopPreheader();
+  if (!PreHead || !LIRUtil::isAlmostEmpty(PreHead))
+    return false;
+
+  // It should have a precondition block where the generated popcount intrinsic
+  // function will be inserted.
+  PreCondBB = LIRUtil::getPrecondBb(PreHead);
+  if (!PreCondBB)
+    return false;
+
+  return true;
+}
+
+Value *NclPopcountRecognize::matchCondition (BranchInst *Br,
+                                             BasicBlock *LoopEntry) const {
+  if (!Br || !Br->isConditional())
+    return 0;
+
+  ICmpInst *Cond = dyn_cast<ICmpInst>(Br->getCondition());
+  if (!Cond)
+    return 0;
+
+  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
+  if (!CmpZero || !CmpZero->isZero())
+    return 0;
+
+  ICmpInst::Predicate Pred = Cond->getPredicate();
+  if ((Pred == ICmpInst::ICMP_NE && Br->getSuccessor(0) == LoopEntry) ||
+      (Pred == ICmpInst::ICMP_EQ && Br->getSuccessor(1) == LoopEntry))
+    return Cond->getOperand(0);
+
+  return 0;
+}
+
+bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst,
+                                       PHINode *&CntPhi,
+                                       Value *&Var) const {
+  // The following code tries to detect this idiom:
+  //
+  //    if (x0 != 0)
+  //      goto loop-exit // the precondition of the loop
+  //    cnt0 = init-val;
+  //    do {
+  //       x1 = phi (x0, x2);
+  //       cnt1 = phi(cnt0, cnt2);
+  //
+  //       cnt2 = cnt1 + 1;
+  //        ...
+  //       x2 = x1 & (x1 - 1);
+  //        ...
+  //    } while(x != 0);
+  //
+  // loop-exit:
+  //
+
+  // step 1: Check to see if the loop-back branch matches this pattern:
+  //    "if (a!=0) goto loop-entry".
+  BasicBlock *LoopEntry;
+  Instruction *DefX2, *CountInst;
+  Value *VarX1, *VarX0;
+  PHINode *PhiX, *CountPhi;
+
+  DefX2 = CountInst = 0;
+  VarX1 = VarX0 = 0;
+  PhiX = CountPhi = 0;
+  LoopEntry = *(CurLoop->block_begin());
+
+  // step 1: Check if the loop-back branch is in desirable form.
+  {
+    if (Value *T = matchCondition (LIRUtil::getBranch(LoopEntry), LoopEntry))
+      DefX2 = dyn_cast<Instruction>(T);
+    else
+      return false;
+  }
+
+  // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
+  {
+    if (DefX2->getOpcode() != Instruction::And)
+      return false;
+
+    BinaryOperator *SubOneOp;
+
+    if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
+      VarX1 = DefX2->getOperand(1);
+    else {
+      VarX1 = DefX2->getOperand(0);
+      SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
+    }
+    if (!SubOneOp)
+      return false;
+
+    Instruction *SubInst = cast<Instruction>(SubOneOp);
+    ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1));
+    if (!Dec ||
+        !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) ||
+          (SubInst->getOpcode() == Instruction::Add && Dec->isAllOnesValue()))) {
+      return false;
+    }
+  }
+
+  // step 3: Check the recurrence of variable X
+  {
+    PhiX = dyn_cast<PHINode>(VarX1);
+    if (!PhiX ||
+        (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) {
+      return false;
+    }
+  }
+
+  // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
+  {
+    CountInst = NULL;
+    for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI(),
+           IterE = LoopEntry->end(); Iter != IterE; Iter++) {
+      Instruction *Inst = Iter;
+      if (Inst->getOpcode() != Instruction::Add)
+        continue;
+
+      ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+      if (!Inc || !Inc->isOne())
+        continue;
+
+      PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
+      if (!Phi || Phi->getParent() != LoopEntry)
+        continue;
+
+      // Check if the result of the instruction is live out of the loop.
+      bool LiveOutLoop = false;
+      for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end();
+           I != E;  I++) {
+        if ((cast<Instruction>(*I))->getParent() != LoopEntry) {
+          LiveOutLoop = true; break;
+        }
+      }
+
+      if (LiveOutLoop) {
+        CountInst = Inst;
+        CountPhi = Phi;
+        break;
+      }
+    }
+
+    if (!CountInst)
+      return false;
+  }
+
+  // step 5: check if the precondition is in this form:
+  //   "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
+  {
+    BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB);
+    Value *T = matchCondition (PreCondBr, CurLoop->getLoopPreheader());
+    if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
+      return false;
+
+    CntInst = CountInst;
+    CntPhi = CountPhi;
+    Var = T;
+  }
+
+  return true;
+}
+
+void NclPopcountRecognize::transform(Instruction *CntInst,
+                                     PHINode *CntPhi, Value *Var) {
+
+  ScalarEvolution *SE = LIR.getScalarEvolution();
+  TargetLibraryInfo *TLI = LIR.getTargetLibraryInfo();
+  BasicBlock *PreHead = CurLoop->getLoopPreheader();
+  BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB);
+  const DebugLoc DL = CntInst->getDebugLoc();
+
+  // Assuming that before the transformation, the loop looks like:
+  //   if (x) // the precondition
+  //     do { cnt++; x &= x - 1; } while(x);
+
+  // Step 1: Insert the ctpop instruction at the end of the precondition block
+  IRBuilderTy Builder(PreCondBr);
+  Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
+  {
+    PopCnt = createPopcntIntrinsic(Builder, Var, DL);
+    NewCount = PopCntZext =
+      Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
+
+    if (NewCount != PopCnt)
+      (cast<Instruction>(NewCount))->setDebugLoc(DL);
+
+    // TripCnt is exactly the number of iterations the loop has
+    TripCnt = NewCount;
+
+    // If the population counter's initial value is not zero, insert Add Inst.
+    Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
+    ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
+    if (!InitConst || !InitConst->isZero()) {
+      NewCount = Builder.CreateAdd(NewCount, CntInitVal);
+      (cast<Instruction>(NewCount))->setDebugLoc(DL);
+    }
+  }
+
+  // Step 2: Replace the precondition from "if(x == 0) goto loop-exit" to
+  //   "if(NewCount == 0) loop-exit". Without this change, the intrinsic
+  //   function would be partially dead code, and downstream passes will drag
+  //   it back from the precondition block to the preheader.
+  {
+    ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
+
+    Value *Opnd0 = PopCntZext;
+    Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
+    if (PreCond->getOperand(0) != Var)
+      std::swap(Opnd0, Opnd1);
+
+    ICmpInst *NewPreCond =
+      cast<ICmpInst>(Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
+    PreCond->replaceAllUsesWith(NewPreCond);
+
+    deleteDeadInstruction(PreCond, *SE, TLI);
+  }
+
+  // Step 3: Note that the population count is exactly the trip count of the
+  //  loop in question, which enables us to convert the loop from a
+  //  noncountable loop into a countable one. The benefit is twofold:
+  //
+  //  - If the loop only counts population, the entire loop becomes dead after
+  //    the transformation. It is much easier to prove a countable loop dead
+  //    than to prove a noncountable one. (In some C dialects, an infinite loop
+  //    isn't dead even if it computes nothing useful. In general, DCE needs
+  //    to prove a noncountable loop finite before safely deleting it.)
+  //
+  //  - If the loop also performs something else, it remains alive.
+ // Since it is transformed to countable form, it can be aggressively + // optimized by some optimizations which are in general not applicable + // to a noncountable loop. + // + // After this step, this loop (conceptually) would look like following: + // newcnt = __builtin_ctpop(x); + // t = newcnt; + // if (x) + // do { cnt++; x &= x-1; t--) } while (t > 0); + BasicBlock *Body = *(CurLoop->block_begin()); + { + BranchInst *LbBr = LIRUtil::getBranch(Body); + ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition()); + Type *Ty = TripCnt->getType(); + + PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", Body->begin()); + + Builder.SetInsertPoint(LbCond); + Value *Opnd1 = cast<Value>(TcPhi); + Value *Opnd2 = cast<Value>(ConstantInt::get(Ty, 1)); + Instruction *TcDec = + cast<Instruction>(Builder.CreateSub(Opnd1, Opnd2, "tcdec", false, true)); + + TcPhi->addIncoming(TripCnt, PreHead); + TcPhi->addIncoming(TcDec, Body); + + CmpInst::Predicate Pred = (LbBr->getSuccessor(0) == Body) ? + CmpInst::ICMP_UGT : CmpInst::ICMP_SLE; + LbCond->setPredicate(Pred); + LbCond->setOperand(0, TcDec); + LbCond->setOperand(1, cast<Value>(ConstantInt::get(Ty, 0))); + } + + // Step 4: All the references to the original population counter outside + // the loop are replaced with the NewCount -- the value returned from + // __builtin_ctpop(). + { + SmallVector<Value *, 4> CntUses; + for (Value::use_iterator I = CntInst->use_begin(), E = CntInst->use_end(); + I != E; I++) { + if (cast<Instruction>(*I)->getParent() != Body) + CntUses.push_back(*I); + } + for (unsigned Idx = 0; Idx < CntUses.size(); Idx++) { + (cast<Instruction>(CntUses[Idx]))->replaceUsesOfWith(CntInst, NewCount); + } + } + + // step 5: Forget the "non-computable" trip-count SCEV associated with the + // loop. The loop would otherwise not be deleted even if it becomes empty. + SE->forgetLoop(CurLoop); +} + +CallInst *NclPopcountRecognize::createPopcntIntrinsic(IRBuilderTy &IRBuilder, + Value *Val, DebugLoc DL) { + Value *Ops[] = { Val }; + Type *Tys[] = { Val->getType() }; + + Module *M = (*(CurLoop->block_begin()))->getParent()->getParent(); + Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys); + CallInst *CI = IRBuilder.CreateCall(Func, Ops); + CI->setDebugLoc(DL); + + return CI; +} + +/// recognize - detect population count idiom in a non-countable loop. If +/// detected, transform the relevant code to popcount intrinsic function +/// call, and return true; otherwise, return false. +bool NclPopcountRecognize::recognize() { + + if (!LIR.getTargetTransformInfo()) + return false; + + LIR.getScalarEvolution(); + + if (!preliminaryScreen()) return false; - const SCEV *BECount = SE->getBackedgeTakenCount(L); + + Instruction *CntInst; + PHINode *CntPhi; + Value *Val; + if (!detectIdiom(CntInst, CntPhi, Val)) + return false; + + transform(CntInst, CntPhi, Val); + return true; +} + +//===----------------------------------------------------------------------===// +// +// Implementation of LoopIdiomRecognize +// +//===----------------------------------------------------------------------===// + +bool LoopIdiomRecognize::runOnCountableLoop() { + const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop); if (isa<SCEVCouldNotCompute>(BECount)) return false; // If this loop executes exactly one time, then it should be peeled, not @@ -194,24 +664,29 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { return false; // We require target data for now. 
- TD = getAnalysisIfAvailable<TargetData>(); - if (TD == 0) return false; + if (!getDataLayout()) + return false; + + // set DT + (void)getDominatorTree(); - DT = &getAnalysis<DominatorTree>(); LoopInfo &LI = getAnalysis<LoopInfo>(); TLI = &getAnalysis<TargetLibraryInfo>(); + // set TLI + (void)getTargetLibraryInfo(); + SmallVector<BasicBlock*, 8> ExitBlocks; CurLoop->getUniqueExitBlocks(ExitBlocks); DEBUG(dbgs() << "loop-idiom Scanning: F[" - << L->getHeader()->getParent()->getName() - << "] Loop %" << L->getHeader()->getName() << "\n"); + << CurLoop->getHeader()->getParent()->getName() + << "] Loop %" << CurLoop->getHeader()->getName() << "\n"); bool MadeChange = false; // Scan all the blocks in the loop that are not in subloops. - for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E; - ++BI) { + for (Loop::block_iterator BI = CurLoop->block_begin(), + E = CurLoop->block_end(); BI != E; ++BI) { // Ignore blocks in subloops. if (LI.getLoopFor(*BI) != CurLoop) continue; @@ -221,6 +696,33 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { return MadeChange; } +bool LoopIdiomRecognize::runOnNoncountableLoop() { + NclPopcountRecognize Popcount(*this); + if (Popcount.recognize()) + return true; + + return false; +} + +bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { + CurLoop = L; + + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. + if (!L->getLoopPreheader()) + return false; + + // Disable loop idiom recognition if the function's name is a common idiom. + StringRef Name = L->getHeader()->getParent()->getName(); + if (Name == "memset" || Name == "memcpy") + return false; + + SE = &getAnalysis<ScalarEvolution>(); + if (SE->hasLoopInvariantBackedgeTakenCount(L)) + return runOnCountableLoop(); + return runOnNoncountableLoop(); +} + /// runOnLoopBlock - Process the specified block, which lives in a counted loop /// with the specified backedge count. This block is known to be in the current /// loop and not in any subloops. @@ -403,7 +905,7 @@ static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access, /// /// Note that we don't ever attempt to use memset_pattern8 or 4, because these /// just replicate their input array and then pass on to memset_pattern16. -static Constant *getMemSetPatternValue(Value *V, const TargetData &TD) { +static Constant *getMemSetPatternValue(Value *V, const DataLayout &TD) { // If the value isn't a constant, we can't promote it to being in a constant // array. We could theoretically do a store to an alloca or something, but // that doesn't seem worthwhile. 
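The noncountable-loop popcount recognizer added above is easiest to see on a concrete loop. The sketch below is ordinary C++ rather than anything from the patch: the first function has the shape the pass matches, and __builtin_popcountll stands in for the llvm.ctpop intrinsic that the transformed IR actually calls.

    #include <cstdint>

    // The matched shape: the loop variable repeatedly drops its lowest set bit
    // (x &= x - 1) and a counter is bumped once per iteration, guarded by an
    // "if (x)" precondition block.
    unsigned popcountLoop(uint64_t X) {
      unsigned Cnt = 0;        // cnt0 = init-val
      while (X) {              // precondition and loop-back test against zero
        ++Cnt;                 // CntInst: cnt2 = cnt1 + 1
        X &= X - 1;            // DefX2:   x2 = x1 & (x1 - 1)
      }
      return Cnt;
    }

    // Conceptually, after the transformation the count is computed up front.
    unsigned popcountTransformed(uint64_t X) {
      return static_cast<unsigned>(__builtin_popcountll(X));
    }

Computing the count before the loop is what lets the pass also rewrite the exit condition against a fresh trip counter, turning the noncountable loop into a countable one for later passes.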
diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp index f5daa7b..c48808f 100644 --- a/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -12,17 +12,17 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "loop-instsimplify" -#include "llvm/Instructions.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/ADT/Statistic.h" using namespace llvm; STATISTIC(NumSimplified, "Number of redundant instructions simplified"); @@ -66,7 +66,7 @@ Pass *llvm::createLoopInstSimplifyPass() { bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>(); LoopInfo *LI = &getAnalysis<LoopInfo>(); - const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); SmallVector<BasicBlock*, 8> ExitBlocks; diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index abe07aa..0ea80f3 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -13,20 +13,20 @@ #define DEBUG_TYPE "loop-rotate" #include "llvm/Transforms/Scalar.h" -#include "llvm/Function.h" -#include "llvm/IntrinsicInst.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CodeMetrics.h" -#include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Transforms/Utils/Local.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Transforms/Utils/ValueMapper.h" -#include "llvm/Support/CFG.h" -#include "llvm/Support/Debug.h" -#include "llvm/ADT/Statistic.h" using namespace llvm; #define MAX_HEADER_SIZE 16 @@ -274,10 +274,16 @@ bool LoopRotate::rotateLoop(Loop *L) { if (OrigLatch == 0 || L->isLoopExiting(OrigLatch)) return false; - // Check size of original header and reject loop if it is very big. + // Check size of original header and reject loop if it is very big or we can't + // duplicate blocks inside it. { CodeMetrics Metrics; Metrics.analyzeBasicBlock(OrigHeader); + if (Metrics.notDuplicatable) { + DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non duplicatable" + << " instructions: "; L->dump()); + return false; + } if (Metrics.NumInsts > MAX_HEADER_SIZE) return false; } diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index d7495da..c7b853e 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -37,8 +37,8 @@ // // TODO: Handle multiple loops at a time. 
// -// TODO: Should TargetLowering::AddrMode::BaseGV be changed to a ConstantExpr -// instead of a GlobalValue? +// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead +// of a GlobalValue? // // TODO: When truncation is free, truncate ICmp users' operands to make it a // smaller encoding (on x86 at least). @@ -55,25 +55,25 @@ #define DEBUG_TYPE "loop-reduce" #include "llvm/Transforms/Scalar.h" -#include "llvm/Constants.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Analysis/IVUsers.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/IVUsers.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Assembly/Writer.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/ADT/SmallBitVector.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/Support/Debug.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLowering.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> using namespace llvm; @@ -121,7 +121,7 @@ void RegSortData::print(raw_ostream &OS) const { OS << "[NumUses=" << UsedByIndices.count() << ']'; } -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void RegSortData::dump() const { print(errs()); errs() << '\n'; } @@ -223,16 +223,24 @@ namespace { /// computing satisfying a use. It may include broken-out immediates and scaled /// registers. struct Formula { - /// AM - This is used to represent complex addressing, as well as other kinds - /// of interesting uses. - TargetLowering::AddrMode AM; + /// Global base address used for complex addressing. + GlobalValue *BaseGV; + + /// Base offset for complex addressing. + int64_t BaseOffset; + + /// Whether any complex addressing has a base register. + bool HasBaseReg; + + /// The scale of any complex addressing. + int64_t Scale; /// BaseRegs - The list of "base" registers for this use. When this is - /// non-empty, AM.HasBaseReg should be set to true. + /// non-empty, SmallVector<const SCEV *, 2> BaseRegs; /// ScaledReg - The 'scaled' register for this use. This should be non-null - /// when AM.Scale is not zero. + /// when Scale is not zero. const SCEV *ScaledReg; /// UnfoldedOffset - An additional constant offset which added near the @@ -240,7 +248,9 @@ struct Formula { /// live in an add immediate field rather than a register. 
int64_t UnfoldedOffset; - Formula() : ScaledReg(0), UnfoldedOffset(0) {} + Formula() + : BaseGV(0), BaseOffset(0), HasBaseReg(false), Scale(0), ScaledReg(0), + UnfoldedOffset(0) {} void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); @@ -326,13 +336,13 @@ void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { const SCEV *Sum = SE.getAddExpr(Good); if (!Sum->isZero()) BaseRegs.push_back(Sum); - AM.HasBaseReg = true; + HasBaseReg = true; } if (!Bad.empty()) { const SCEV *Sum = SE.getAddExpr(Bad); if (!Sum->isZero()) BaseRegs.push_back(Sum); - AM.HasBaseReg = true; + HasBaseReg = true; } } @@ -348,7 +358,7 @@ unsigned Formula::getNumRegs() const { Type *Formula::getType() const { return !BaseRegs.empty() ? BaseRegs.front()->getType() : ScaledReg ? ScaledReg->getType() : - AM.BaseGV ? AM.BaseGV->getType() : + BaseGV ? BaseGV->getType() : 0; } @@ -381,29 +391,29 @@ bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx, void Formula::print(raw_ostream &OS) const { bool First = true; - if (AM.BaseGV) { + if (BaseGV) { if (!First) OS << " + "; else First = false; - WriteAsOperand(OS, AM.BaseGV, /*PrintType=*/false); + WriteAsOperand(OS, BaseGV, /*PrintType=*/false); } - if (AM.BaseOffs != 0) { + if (BaseOffset != 0) { if (!First) OS << " + "; else First = false; - OS << AM.BaseOffs; + OS << BaseOffset; } for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(), E = BaseRegs.end(); I != E; ++I) { if (!First) OS << " + "; else First = false; OS << "reg(" << **I << ')'; } - if (AM.HasBaseReg && BaseRegs.empty()) { + if (HasBaseReg && BaseRegs.empty()) { if (!First) OS << " + "; else First = false; OS << "**error: HasBaseReg**"; - } else if (!AM.HasBaseReg && !BaseRegs.empty()) { + } else if (!HasBaseReg && !BaseRegs.empty()) { if (!First) OS << " + "; else First = false; OS << "**error: !HasBaseReg**"; } - if (AM.Scale != 0) { + if (Scale != 0) { if (!First) OS << " + "; else First = false; - OS << AM.Scale << "*reg("; + OS << Scale << "*reg("; if (ScaledReg) OS << *ScaledReg; else @@ -416,7 +426,7 @@ void Formula::print(raw_ostream &OS) const { } } -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void Formula::dump() const { print(errs()); errs() << '\n'; } @@ -926,8 +936,8 @@ void Cost::RateFormula(const Formula &F, // Tally up the non-zero immediates. for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(), E = Offsets.end(); I != E; ++I) { - int64_t Offset = (uint64_t)*I + F.AM.BaseOffs; - if (F.AM.BaseGV) + int64_t Offset = (uint64_t)*I + F.BaseOffset; + if (F.BaseGV) ImmCost += 64; // Handle symbolic values conservatively. // TODO: This should probably be the pointer size. 
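[Editorial note] The Formula hunk above replaces the embedded TargetLowering::AddrMode with explicit BaseGV/BaseOffset/HasBaseReg/Scale fields. A minimal standalone sketch of what those fields describe, assuming the usual LSR decomposition address = BaseGV + BaseOffset + sum(BaseRegs) + Scale*ScaledReg (the struct and helper below are illustrative, not the pass's code):

#include <cstdint>
#include <vector>

// Illustrative mirror of the decomposed addressing form used by LSR formulae.
struct AddrFormula {
  const void *BaseGV;   // symbolic base (global), if any
  int64_t BaseOffset;   // constant displacement
  bool HasBaseReg;      // true whenever base registers are present
  int64_t Scale;        // non-zero only when a scaled register is used
};

// Hypothetical helper showing how the pieces combine into a concrete address.
static int64_t evaluate(const AddrFormula &F, int64_t GVAddr,
                        const std::vector<int64_t> &BaseRegs, int64_t ScaledReg) {
  int64_t Addr = (F.BaseGV ? GVAddr : 0) + F.BaseOffset;
  for (int64_t R : BaseRegs)
    Addr += R;
  if (F.Scale != 0)
    Addr += F.Scale * ScaledReg;
  return Addr;
}

int main() {
  AddrFormula F{&F, /*BaseOffset=*/16, /*HasBaseReg=*/true, /*Scale=*/4};
  // Something like GV + 16 + reg + 4*idx: 1000 + 16 + 8 + 4*3 == 1036.
  return evaluate(F, /*GVAddr=*/1000, /*BaseRegs=*/{8}, /*ScaledReg=*/3) == 1036 ? 0 : 1;
}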
else if (Offset != 0) @@ -978,7 +988,7 @@ void Cost::print(raw_ostream &OS) const { OS << ", plus " << SetupCost << " setup cost"; } -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void Cost::dump() const { print(errs()); errs() << '\n'; } @@ -1066,7 +1076,7 @@ void LSRFixup::print(raw_ostream &OS) const { OS << ", Offset=" << Offset; } -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void LSRFixup::dump() const { print(errs()); errs() << '\n'; } @@ -1260,7 +1270,7 @@ void LSRUse::print(raw_ostream &OS) const { OS << ", widest fixup type: " << *WidestFixupType; } -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void LSRUse::dump() const { print(errs()); errs() << '\n'; } @@ -1269,46 +1279,42 @@ void LSRUse::dump() const { /// isLegalUse - Test whether the use described by AM is "legal", meaning it can /// be completely folded into the user instruction at isel time. This includes /// address-mode folding and special icmp tricks. -static bool isLegalUse(const TargetLowering::AddrMode &AM, - LSRUse::KindType Kind, Type *AccessTy, - const TargetLowering *TLI) { +static bool isLegalUse(const TargetTransformInfo &TTI, LSRUse::KindType Kind, + Type *AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, + bool HasBaseReg, int64_t Scale) { switch (Kind) { case LSRUse::Address: - // If we have low-level target information, ask the target if it can - // completely fold this address. - if (TLI) return TLI->isLegalAddressingMode(AM, AccessTy); + return TTI.isLegalAddressingMode(AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale); // Otherwise, just guess that reg+reg addressing is legal. - return !AM.BaseGV && AM.BaseOffs == 0 && AM.Scale <= 1; + //return ; case LSRUse::ICmpZero: // There's not even a target hook for querying whether it would be legal to // fold a GV into an ICmp. - if (AM.BaseGV) + if (BaseGV) return false; // ICmp only has two operands; don't allow more than two non-trivial parts. - if (AM.Scale != 0 && AM.HasBaseReg && AM.BaseOffs != 0) + if (Scale != 0 && HasBaseReg && BaseOffset != 0) return false; // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by // putting the scaled register in the other operand of the icmp. - if (AM.Scale != 0 && AM.Scale != -1) + if (Scale != 0 && Scale != -1) return false; // If we have low-level target information, ask the target if it can fold an // integer immediate on an icmp. - if (AM.BaseOffs != 0) { - if (!TLI) - return false; + if (BaseOffset != 0) { // We have one of: - // ICmpZero BaseReg + Offset => ICmp BaseReg, -Offset - // ICmpZero -1*ScaleReg + Offset => ICmp ScaleReg, Offset + // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset + // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset // Offs is the ICmp immediate. - int64_t Offs = AM.BaseOffs; - if (AM.Scale == 0) - Offs = -(uint64_t)Offs; // The cast does the right thing with INT64_MIN. - return TLI->isLegalICmpImmediate(Offs); + if (Scale == 0) + // The cast does the right thing with INT64_MIN. + BaseOffset = -(uint64_t)BaseOffset; + return TTI.isLegalICmpImmediate(BaseOffset); } // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg @@ -1316,92 +1322,87 @@ static bool isLegalUse(const TargetLowering::AddrMode &AM, case LSRUse::Basic: // Only handle single-register values. - return !AM.BaseGV && AM.Scale == 0 && AM.BaseOffs == 0; + return !BaseGV && Scale == 0 && BaseOffset == 0; case LSRUse::Special: // Special case Basic to handle -1 scales. 
- return !AM.BaseGV && (AM.Scale == 0 || AM.Scale == -1) && AM.BaseOffs == 0; + return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0; } llvm_unreachable("Invalid LSRUse Kind!"); } -static bool isLegalUse(TargetLowering::AddrMode AM, - int64_t MinOffset, int64_t MaxOffset, - LSRUse::KindType Kind, Type *AccessTy, - const TargetLowering *TLI) { +static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, + int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, + GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, + int64_t Scale) { // Check for overflow. - if (((int64_t)((uint64_t)AM.BaseOffs + MinOffset) > AM.BaseOffs) != + if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) != (MinOffset > 0)) return false; - AM.BaseOffs = (uint64_t)AM.BaseOffs + MinOffset; - if (isLegalUse(AM, Kind, AccessTy, TLI)) { - AM.BaseOffs = (uint64_t)AM.BaseOffs - MinOffset; - // Check for overflow. - if (((int64_t)((uint64_t)AM.BaseOffs + MaxOffset) > AM.BaseOffs) != - (MaxOffset > 0)) - return false; - AM.BaseOffs = (uint64_t)AM.BaseOffs + MaxOffset; - return isLegalUse(AM, Kind, AccessTy, TLI); - } - return false; + MinOffset = (uint64_t)BaseOffset + MinOffset; + if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) != + (MaxOffset > 0)) + return false; + MaxOffset = (uint64_t)BaseOffset + MaxOffset; + + return isLegalUse(TTI, Kind, AccessTy, BaseGV, MinOffset, HasBaseReg, + Scale) && + isLegalUse(TTI, Kind, AccessTy, BaseGV, MaxOffset, HasBaseReg, Scale); +} + +static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, + int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, + const Formula &F) { + return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV, + F.BaseOffset, F.HasBaseReg, F.Scale); } -static bool isAlwaysFoldable(int64_t BaseOffs, - GlobalValue *BaseGV, - bool HasBaseReg, +static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, Type *AccessTy, - const TargetLowering *TLI) { + GlobalValue *BaseGV, int64_t BaseOffset, + bool HasBaseReg) { // Fast-path: zero is always foldable. - if (BaseOffs == 0 && !BaseGV) return true; + if (BaseOffset == 0 && !BaseGV) return true; // Conservatively, create an address with an immediate and a // base and a scale. - TargetLowering::AddrMode AM; - AM.BaseOffs = BaseOffs; - AM.BaseGV = BaseGV; - AM.HasBaseReg = HasBaseReg; - AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1; + int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1; // Canonicalize a scale of 1 to a base register if the formula doesn't // already have a base register. - if (!AM.HasBaseReg && AM.Scale == 1) { - AM.Scale = 0; - AM.HasBaseReg = true; + if (!HasBaseReg && Scale == 1) { + Scale = 0; + HasBaseReg = true; } - return isLegalUse(AM, Kind, AccessTy, TLI); + return isLegalUse(TTI, Kind, AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale); } -static bool isAlwaysFoldable(const SCEV *S, - int64_t MinOffset, int64_t MaxOffset, - bool HasBaseReg, - LSRUse::KindType Kind, Type *AccessTy, - const TargetLowering *TLI, - ScalarEvolution &SE) { +static bool isAlwaysFoldable(const TargetTransformInfo &TTI, + ScalarEvolution &SE, int64_t MinOffset, + int64_t MaxOffset, LSRUse::KindType Kind, + Type *AccessTy, const SCEV *S, bool HasBaseReg) { // Fast-path: zero is always foldable. if (S->isZero()) return true; // Conservatively, create an address with an immediate and a // base and a scale. 
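[Editorial note] The new three-argument isLegalUse overload above folds a use's [MinOffset, MaxOffset] range into the target query by testing both endpoints and rejecting signed overflow first. A standalone sketch of that shape, with a generic predicate standing in for the TTI query (names here are illustrative):

#include <cstdint>
#include <functional>

// Sketch: a formula is legal across an offset range only if it is legal at
// both extremes and neither endpoint addition overflows int64_t.
static bool legalOverRange(int64_t BaseOffset, int64_t MinOffset, int64_t MaxOffset,
                           const std::function<bool(int64_t)> &LegalAtOffset) {
  // Overflow check: the sum must move in the direction the offset's sign implies.
  if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) != (MinOffset > 0))
    return false;
  int64_t Lo = (int64_t)((uint64_t)BaseOffset + MinOffset);
  if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) != (MaxOffset > 0))
    return false;
  int64_t Hi = (int64_t)((uint64_t)BaseOffset + MaxOffset);
  return LegalAtOffset(Lo) && LegalAtOffset(Hi);
}

int main() {
  // Illustrative legality window; a real target would answer via its own hook.
  auto Legal = [](int64_t Off) { return Off >= -4096 && Off < 4096; };
  return legalOverRange(/*BaseOffset=*/100, /*MinOffset=*/-50, /*MaxOffset=*/200, Legal) ? 0 : 1;
}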
- int64_t BaseOffs = ExtractImmediate(S, SE); + int64_t BaseOffset = ExtractImmediate(S, SE); GlobalValue *BaseGV = ExtractSymbol(S, SE); // If there's anything else involved, it's not foldable. if (!S->isZero()) return false; // Fast-path: zero is always foldable. - if (BaseOffs == 0 && !BaseGV) return true; + if (BaseOffset == 0 && !BaseGV) return true; // Conservatively, create an address with an immediate and a // base and a scale. - TargetLowering::AddrMode AM; - AM.BaseOffs = BaseOffs; - AM.BaseGV = BaseGV; - AM.HasBaseReg = HasBaseReg; - AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1; + int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1; - return isLegalUse(AM, MinOffset, MaxOffset, Kind, AccessTy, TLI); + return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV, + BaseOffset, HasBaseReg, Scale); } namespace { @@ -1501,7 +1502,7 @@ class LSRInstance { ScalarEvolution &SE; DominatorTree &DT; LoopInfo &LI; - const TargetLowering *const TLI; + const TargetTransformInfo &TTI; Loop *const L; bool Changed; @@ -1637,7 +1638,7 @@ class LSRInstance { Pass *P); public: - LSRInstance(const TargetLowering *tli, Loop *l, Pass *P); + LSRInstance(Loop *L, Pass *P); bool getChanged() const { return Changed; } @@ -1687,12 +1688,9 @@ void LSRInstance::OptimizeShadowIV() { } if (!DestTy) continue; - if (TLI) { - // If target does not support DestTy natively then do not apply - // this transformation. - EVT DVT = TLI->getValueType(DestTy); - if (!TLI->isTypeLegal(DVT)) continue; - } + // If target does not support DestTy natively then do not apply + // this transformation. + if (!TTI.isTypeLegal(DestTy)) continue; PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0)); if (!PH) continue; @@ -2014,18 +2012,17 @@ LSRInstance::OptimizeLoopTermCond() { if (C->getValue().getMinSignedBits() >= 64 || C->getValue().isMinSignedValue()) goto decline_post_inc; - // Without TLI, assume that any stride might be valid, and so any - // use might be shared. - if (!TLI) - goto decline_post_inc; // Check for possible scaled-address reuse. Type *AccessTy = getAccessType(UI->getUser()); - TargetLowering::AddrMode AM; - AM.Scale = C->getSExtValue(); - if (TLI->isLegalAddressingMode(AM, AccessTy)) + int64_t Scale = C->getSExtValue(); + if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ 0, + /*BaseOffset=*/ 0, + /*HasBaseReg=*/ false, Scale)) goto decline_post_inc; - AM.Scale = -AM.Scale; - if (TLI->isLegalAddressingMode(AM, AccessTy)) + Scale = -Scale; + if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ 0, + /*BaseOffset=*/ 0, + /*HasBaseReg=*/ false, Scale)) goto decline_post_inc; } } @@ -2095,13 +2092,13 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, return false; // Conservatively assume HasBaseReg is true for now. if (NewOffset < LU.MinOffset) { - if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, HasBaseReg, - Kind, AccessTy, TLI)) + if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0, + LU.MaxOffset - NewOffset, HasBaseReg)) return false; NewMinOffset = NewOffset; } else if (NewOffset > LU.MaxOffset) { - if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, HasBaseReg, - Kind, AccessTy, TLI)) + if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0, + NewOffset - LU.MinOffset, HasBaseReg)) return false; NewMaxOffset = NewOffset; } @@ -2130,7 +2127,8 @@ LSRInstance::getUse(const SCEV *&Expr, int64_t Offset = ExtractImmediate(Expr, SE); // Basic uses can't accept any offset, for example. 
- if (!isAlwaysFoldable(Offset, 0, /*HasBaseReg=*/true, Kind, AccessTy, TLI)) { + if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0, + Offset, /*HasBaseReg=*/ true)) { Expr = Copy; Offset = 0; } @@ -2198,10 +2196,10 @@ LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF, // as OrigF. if (F.BaseRegs == OrigF.BaseRegs && F.ScaledReg == OrigF.ScaledReg && - F.AM.BaseGV == OrigF.AM.BaseGV && - F.AM.Scale == OrigF.AM.Scale && + F.BaseGV == OrigF.BaseGV && + F.Scale == OrigF.Scale && F.UnfoldedOffset == OrigF.UnfoldedOffset) { - if (F.AM.BaseOffs == 0) + if (F.BaseOffset == 0) return &LU; // This is the formula where all the registers and symbols matched; // there aren't going to be any others. Since we declined it, we @@ -2395,7 +2393,7 @@ bool IVChain::isProfitableIncrement(const SCEV *OperExpr, /// TODO: Consider IVInc free if it's already used in another chains. static bool isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users, - ScalarEvolution &SE, const TargetLowering *TLI) { + ScalarEvolution &SE, const TargetTransformInfo &TTI) { if (StressIVChain) return true; @@ -2653,7 +2651,7 @@ void LSRInstance::CollectChains() { for (unsigned UsersIdx = 0, NChains = IVChainVec.size(); UsersIdx < NChains; ++UsersIdx) { if (!isProfitableChain(IVChainVec[UsersIdx], - ChainUsersVec[UsersIdx].FarUsers, SE, TLI)) + ChainUsersVec[UsersIdx].FarUsers, SE, TTI)) continue; // Preserve the chain at UsesIdx. if (ChainIdx != UsersIdx) @@ -2680,7 +2678,7 @@ void LSRInstance::FinalizeChain(IVChain &Chain) { /// Return true if the IVInc can be folded into an addressing mode. static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, - Value *Operand, const TargetLowering *TLI) { + Value *Operand, const TargetTransformInfo &TTI) { const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr); if (!IncConst || !isAddressUse(UserInst, Operand)) return false; @@ -2689,8 +2687,9 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, return false; int64_t IncOffset = IncConst->getValue()->getSExtValue(); - if (!isAlwaysFoldable(IncOffset, /*BaseGV=*/0, /*HaseBaseReg=*/false, - LSRUse::Address, getAccessType(UserInst), TLI)) + if (!isAlwaysFoldable(TTI, LSRUse::Address, + getAccessType(UserInst), /*BaseGV=*/ 0, + IncOffset, /*HaseBaseReg=*/ false)) return false; return true; @@ -2761,7 +2760,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, // If an IV increment can't be folded, use it as the next IV value. if (!canFoldIVIncExpr(LeftOverExpr, IncI->UserInst, IncI->IVOperand, - TLI)) { + TTI)) { assert(IVTy == IVOper->getType() && "inconsistent IV increment type"); IVSrc = IVOper; LeftOverExpr = 0; @@ -2892,6 +2891,7 @@ void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { Formula F; F.InitialMatch(S, L, SE); + F.HasBaseReg = true; bool Inserted = InsertFormula(LU, LUIdx, F); assert(Inserted && "Initial formula already exists!"); (void)Inserted; } @@ -2903,7 +2903,6 @@ LSRInstance::InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { Formula F; F.BaseRegs.push_back(S); - F.AM.HasBaseReg = true; bool Inserted = InsertFormula(LU, LUIdx, F); assert(Inserted && "Supplemental formula already exists!"); (void)Inserted; } @@ -3105,9 +3104,8 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, // Don't pull a constant into a register if the constant could be folded // into an immediate field. 
- if (isAlwaysFoldable(*J, LU.MinOffset, LU.MaxOffset, - Base.getNumRegs() > 1, - LU.Kind, LU.AccessTy, TLI, SE)) + if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, *J, Base.getNumRegs() > 1)) continue; // Collect all operands except *J. @@ -3119,9 +3117,8 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, // Don't leave just a constant behind in a register if the constant could // be folded into an immediate field. if (InnerAddOps.size() == 1 && - isAlwaysFoldable(InnerAddOps[0], LU.MinOffset, LU.MaxOffset, - Base.getNumRegs() > 1, - LU.Kind, LU.AccessTy, TLI, SE)) + isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1)) continue; const SCEV *InnerSum = SE.getAddExpr(InnerAddOps); @@ -3131,10 +3128,10 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, // Add the remaining pieces of the add back into the new formula. const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum); - if (TLI && InnerSumSC && + if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 && - TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset + - InnerSumSC->getValue()->getZExtValue())) { + TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + InnerSumSC->getValue()->getZExtValue())) { F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue(); F.BaseRegs.erase(F.BaseRegs.begin() + i); @@ -3143,9 +3140,9 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, // Add J as its own register, or an unfolded immediate. const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J); - if (TLI && SC && SE.getTypeSizeInBits(SC->getType()) <= 64 && - TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset + - SC->getValue()->getZExtValue())) + if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 && + TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + SC->getValue()->getZExtValue())) F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue(); else @@ -3194,7 +3191,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base) { // We can't add a symbolic offset if the address already contains one. - if (Base.AM.BaseGV) return; + if (Base.BaseGV) return; for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { const SCEV *G = Base.BaseRegs[i]; @@ -3202,9 +3199,8 @@ void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, if (G->isZero() || !GV) continue; Formula F = Base; - F.AM.BaseGV = GV; - if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, TLI)) + F.BaseGV = GV; + if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) continue; F.BaseRegs[i] = G; (void)InsertFormula(LU, LUIdx, F); @@ -3227,9 +3223,9 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(), E = Worklist.end(); I != E; ++I) { Formula F = Base; - F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs - *I; - if (isLegalUse(F.AM, LU.MinOffset - *I, LU.MaxOffset - *I, - LU.Kind, LU.AccessTy, TLI)) { + F.BaseOffset = (uint64_t)Base.BaseOffset - *I; + if (isLegalUse(TTI, LU.MinOffset - *I, LU.MaxOffset - *I, LU.Kind, + LU.AccessTy, F)) { // Add the offset to the base register. const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G); // If it cancelled out, drop the base register, otherwise update it. 
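[Editorial note] The GenerateReassociations hunks above keep a broken-out constant in UnfoldedOffset when TTI.isLegalAddImmediate accepts it, and only otherwise give it its own register. A standalone sketch of that decision; the 12-bit window in the stand-in predicate is purely illustrative, not any target's actual rule:

#include <cstdint>
#include <vector>

// Stand-in for TargetTransformInfo::isLegalAddImmediate (illustrative range).
static bool isLegalAddImmediate(int64_t Imm) {
  return Imm >= -2048 && Imm < 2048;
}

// Sketch: fold a constant term into the accumulated unfolded offset when the
// target can encode it as an add immediate; otherwise keep it as a register.
static void placeConstant(int64_t C, int64_t &UnfoldedOffset,
                          std::vector<int64_t> &BaseRegs) {
  int64_t Folded = (int64_t)((uint64_t)UnfoldedOffset + (uint64_t)C);
  if (isLegalAddImmediate(Folded))
    UnfoldedOffset = Folded;
  else
    BaseRegs.push_back(C);   // fall back to materializing it in a register
}

int main() {
  int64_t UnfoldedOffset = 0;
  std::vector<int64_t> BaseRegs;
  placeConstant(100, UnfoldedOffset, BaseRegs);      // fits: folded into the offset
  placeConstant(1 << 20, UnfoldedOffset, BaseRegs);  // too wide: becomes a register
  return (UnfoldedOffset == 100 && BaseRegs.size() == 1) ? 0 : 1;
}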
@@ -3247,9 +3243,8 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, if (G->isZero() || Imm == 0) continue; Formula F = Base; - F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Imm; - if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, TLI)) + F.BaseOffset = (uint64_t)F.BaseOffset + Imm; + if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) continue; F.BaseRegs[i] = G; (void)InsertFormula(LU, LUIdx, F); @@ -3270,7 +3265,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, // Don't do this if there is more than one offset. if (LU.MinOffset != LU.MaxOffset) return; - assert(!Base.AM.BaseGV && "ICmpZero use is not legal!"); + assert(!Base.BaseGV && "ICmpZero use is not legal!"); // Check each interesting stride. for (SmallSetVector<int64_t, 8>::const_iterator @@ -3278,10 +3273,10 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, int64_t Factor = *I; // Check that the multiplication doesn't overflow. - if (Base.AM.BaseOffs == INT64_MIN && Factor == -1) + if (Base.BaseOffset == INT64_MIN && Factor == -1) continue; - int64_t NewBaseOffs = (uint64_t)Base.AM.BaseOffs * Factor; - if (NewBaseOffs / Factor != Base.AM.BaseOffs) + int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor; + if (NewBaseOffset / Factor != Base.BaseOffset) continue; // Check that multiplying with the use offset doesn't overflow. @@ -3293,14 +3288,14 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, continue; Formula F = Base; - F.AM.BaseOffs = NewBaseOffs; + F.BaseOffset = NewBaseOffset; // Check that this scale is legal. - if (!isLegalUse(F.AM, Offset, Offset, LU.Kind, LU.AccessTy, TLI)) + if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F)) continue; // Compensate for the use having MinOffset built into it. - F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Offset - LU.MinOffset; + F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset; const SCEV *FactorS = SE.getConstant(IntTy, Factor); @@ -3341,23 +3336,23 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { if (!IntTy) return; // If this Formula already has a scaled register, we can't add another one. - if (Base.AM.Scale != 0) return; + if (Base.Scale != 0) return; // Check each interesting stride. for (SmallSetVector<int64_t, 8>::const_iterator I = Factors.begin(), E = Factors.end(); I != E; ++I) { int64_t Factor = *I; - Base.AM.Scale = Factor; - Base.AM.HasBaseReg = Base.BaseRegs.size() > 1; + Base.Scale = Factor; + Base.HasBaseReg = Base.BaseRegs.size() > 1; // Check whether this scale is going to be legal. - if (!isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, TLI)) { + if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, + Base)) { // As a special-case, handle special out-of-loop Basic users specially. // TODO: Reconsider this special case. if (LU.Kind == LSRUse::Basic && - isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset, - LSRUse::Special, LU.AccessTy, TLI) && + isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special, + LU.AccessTy, Base) && LU.AllFixupsOutsideLoop) LU.Kind = LSRUse::Special; else @@ -3366,7 +3361,7 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { // For an ICmpZero, negating a solitary base register won't lead to // new solutions. 
if (LU.Kind == LSRUse::ICmpZero && - !Base.AM.HasBaseReg && Base.AM.BaseOffs == 0 && !Base.AM.BaseGV) + !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV) continue; // For each addrec base reg, apply the scale, if possible. for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) @@ -3390,11 +3385,8 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { /// GenerateTruncates - Generate reuse formulae from different IV types. void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { - // This requires TargetLowering to tell us which truncates are free. - if (!TLI) return; - // Don't bother truncating symbolic values. - if (Base.AM.BaseGV) return; + if (Base.BaseGV) return; // Determine the integer type for the base formula. Type *DstTy = Base.getType(); @@ -3404,7 +3396,7 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { for (SmallSetVector<Type *, 4>::const_iterator I = Types.begin(), E = Types.end(); I != E; ++I) { Type *SrcTy = *I; - if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) { + if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) { Formula F = Base; if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I); @@ -3446,7 +3438,7 @@ void WorkItem::print(raw_ostream &OS) const { << " , add offset " << Imm; } -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void WorkItem::dump() const { print(errs()); errs() << '\n'; } @@ -3551,16 +3543,15 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { const Formula &F = LU.Formulae[L]; // Use the immediate in the scaled register. if (F.ScaledReg == OrigReg) { - int64_t Offs = (uint64_t)F.AM.BaseOffs + - Imm * (uint64_t)F.AM.Scale; + int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale; // Don't create 50 + reg(-50). if (F.referencesReg(SE.getSCEV( - ConstantInt::get(IntTy, -(uint64_t)Offs)))) + ConstantInt::get(IntTy, -(uint64_t)Offset)))) continue; Formula NewF = F; - NewF.AM.BaseOffs = Offs; - if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, TLI)) + NewF.BaseOffset = Offset; + if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, + NewF)) continue; NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg); @@ -3569,9 +3560,9 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // immediate itself, then the formula isn't worthwhile. if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) if (C->getValue()->isNegative() != - (NewF.AM.BaseOffs < 0) && - (C->getValue()->getValue().abs() * APInt(BitWidth, F.AM.Scale)) - .ule(abs64(NewF.AM.BaseOffs))) + (NewF.BaseOffset < 0) && + (C->getValue()->getValue().abs() * APInt(BitWidth, F.Scale)) + .ule(abs64(NewF.BaseOffset))) continue; // OK, looks good. 
@@ -3583,11 +3574,10 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { if (BaseReg != OrigReg) continue; Formula NewF = F; - NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm; - if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, TLI)) { - if (!TLI || - !TLI->isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm)) + NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm; + if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, + LU.Kind, LU.AccessTy, NewF)) { + if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm)) continue; NewF = F; NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm; @@ -3601,11 +3591,11 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { J = NewF.BaseRegs.begin(), JE = NewF.BaseRegs.end(); J != JE; ++J) if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*J)) - if ((C->getValue()->getValue() + NewF.AM.BaseOffs).abs().slt( - abs64(NewF.AM.BaseOffs)) && + if ((C->getValue()->getValue() + NewF.BaseOffset).abs().slt( + abs64(NewF.BaseOffset)) && (C->getValue()->getValue() + - NewF.AM.BaseOffs).countTrailingZeros() >= - CountTrailingZeros_64(NewF.AM.BaseOffs)) + NewF.BaseOffset).countTrailingZeros() >= + CountTrailingZeros_64(NewF.BaseOffset)) goto skip_formula; // Ok, looks good. @@ -3803,7 +3793,7 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) { if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) { Formula NewF = F; - NewF.AM.BaseOffs += C->getValue()->getSExtValue(); + NewF.BaseOffset += C->getValue()->getSExtValue(); NewF.BaseRegs.erase(NewF.BaseRegs.begin() + (I - F.BaseRegs.begin())); if (LU.HasFormulaWithSameRegs(NewF)) { @@ -3816,9 +3806,9 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { } } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) { if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) - if (!F.AM.BaseGV) { + if (!F.BaseGV) { Formula NewF = F; - NewF.AM.BaseGV = GV; + NewF.BaseGV = GV; NewF.BaseRegs.erase(NewF.BaseRegs.begin() + (I - F.BaseRegs.begin())); if (LU.HasFormulaWithSameRegs(NewF)) { @@ -3861,9 +3851,9 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), E = LU.Formulae.end(); I != E; ++I) { const Formula &F = *I; - if (F.AM.BaseOffs != 0 && F.AM.Scale == 0) { + if (F.BaseOffset != 0 && F.Scale == 0) { if (LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU)) { - if (reconcileNewOffset(*LUThatHas, F.AM.BaseOffs, + if (reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/false, LU.Kind, LU.AccessTy)) { DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); @@ -3877,7 +3867,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { LSRFixup &Fixup = *I; if (Fixup.LUIdx == LUIdx) { Fixup.LUIdx = LUThatHas - &Uses.front(); - Fixup.Offset += F.AM.BaseOffs; + Fixup.Offset += F.BaseOffset; // Add the new offset to LUThatHas' offset list. 
if (LUThatHas->Offsets.back() != Fixup.Offset) { LUThatHas->Offsets.push_back(Fixup.Offset); @@ -3897,9 +3887,8 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { bool Any = false; for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) { Formula &F = LUThatHas->Formulae[i]; - if (!isLegalUse(F.AM, - LUThatHas->MinOffset, LUThatHas->MaxOffset, - LUThatHas->Kind, LUThatHas->AccessTy, TLI)) { + if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset, + LUThatHas->Kind, LUThatHas->AccessTy, F)) { DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n'); LUThatHas->DeleteFormula(F); @@ -4307,7 +4296,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Expand the ScaledReg portion. Value *ICmpScaledV = 0; - if (F.AM.Scale != 0) { + if (F.Scale != 0) { const SCEV *ScaledS = F.ScaledReg; // If we're expanding for a post-inc user, make the post-inc adjustment. @@ -4320,7 +4309,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // An interesting way of "folding" with an icmp is to use a negated // scale, which we'll implement by inserting it into the other operand // of the icmp. - assert(F.AM.Scale == -1 && + assert(F.Scale == -1 && "The only scale supported by ICmpZero uses is -1!"); ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP); } else { @@ -4335,20 +4324,20 @@ Value *LSRInstance::Expand(const LSRFixup &LF, } ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP)); ScaledS = SE.getMulExpr(ScaledS, - SE.getConstant(ScaledS->getType(), F.AM.Scale)); + SE.getConstant(ScaledS->getType(), F.Scale)); Ops.push_back(ScaledS); } } // Expand the GV portion. - if (F.AM.BaseGV) { + if (F.BaseGV) { // Flush the operand list to suppress SCEVExpander hoisting. if (!Ops.empty()) { Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } - Ops.push_back(SE.getUnknown(F.AM.BaseGV)); + Ops.push_back(SE.getUnknown(F.BaseGV)); } // Flush the operand list to suppress SCEVExpander hoisting of both folded and @@ -4360,7 +4349,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, } // Expand the immediate portion. - int64_t Offset = (uint64_t)F.AM.BaseOffs + LF.Offset; + int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset; if (Offset != 0) { if (LU.Kind == LSRUse::ICmpZero) { // The other interesting way of "folding" with an ICmpZero is to use a @@ -4401,9 +4390,9 @@ Value *LSRInstance::Expand(const LSRFixup &LF, if (LU.Kind == LSRUse::ICmpZero) { ICmpInst *CI = cast<ICmpInst>(LF.UserInst); DeadInsts.push_back(CI->getOperand(1)); - assert(!F.AM.BaseGV && "ICmp does not support folding a global value and " + assert(!F.BaseGV && "ICmp does not support folding a global value and " "a scale at the same time!"); - if (F.AM.Scale == -1) { + if (F.Scale == -1) { if (ICmpScaledV->getType() != OpTy) { Instruction *Cast = CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false, @@ -4413,7 +4402,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, } CI->setOperand(1, ICmpScaledV); } else { - assert(F.AM.Scale == 0 && + assert(F.Scale == 0 && "ICmp does not support folding a global value and " "a scale at the same time!"); Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy), @@ -4464,17 +4453,21 @@ void LSRInstance::RewriteForPHI(PHINode *PN, SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs); NewBB = NewBBs[0]; } - - // If PN is outside of the loop and BB is in the loop, we want to - // move the block to be immediately before the PHI block, not - // immediately after BB. 
- if (L->contains(BB) && !L->contains(PN)) - NewBB->moveBefore(PN->getParent()); - - // Splitting the edge can reduce the number of PHI entries we have. - e = PN->getNumIncomingValues(); - BB = NewBB; - i = PN->getBasicBlockIndex(BB); + // If NewBB==NULL, then SplitCriticalEdge refused to split because all + // phi predecessors are identical. The simple thing to do is skip + // splitting in this case rather than complicate the API. + if (NewBB) { + // If PN is outside of the loop and BB is in the loop, we want to + // move the block to be immediately before the PHI block, not + // immediately after BB. + if (L->contains(BB) && !L->contains(PN)) + NewBB->moveBefore(PN->getParent()); + + // Splitting the edge can reduce the number of PHI entries we have. + e = PN->getNumIncomingValues(); + BB = NewBB; + i = PN->getBasicBlockIndex(BB); + } } } @@ -4584,13 +4577,11 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, Changed |= DeleteTriviallyDeadInstructions(DeadInsts); } -LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) - : IU(P->getAnalysis<IVUsers>()), - SE(P->getAnalysis<ScalarEvolution>()), - DT(P->getAnalysis<DominatorTree>()), - LI(P->getAnalysis<LoopInfo>()), - TLI(tli), L(l), Changed(false), IVIncInsertPos(0) { - +LSRInstance::LSRInstance(Loop *L, Pass *P) + : IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()), + DT(P->getAnalysis<DominatorTree>()), LI(P->getAnalysis<LoopInfo>()), + TTI(P->getAnalysis<TargetTransformInfo>()), L(L), Changed(false), + IVIncInsertPos(0) { // If LoopSimplify form is not available, stay out of trouble. if (!L->isLoopSimplifyForm()) return; @@ -4673,14 +4664,14 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) #ifndef NDEBUG // Formulae should be legal. - for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), - E = Uses.end(); I != E; ++I) { - const LSRUse &LU = *I; - for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(), - JE = LU.Formulae.end(); J != JE; ++J) - assert(isLegalUse(J->AM, LU.MinOffset, LU.MaxOffset, - LU.Kind, LU.AccessTy, TLI) && - "Illegal formula generated!"); + for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), E = Uses.end(); + I != E; ++I) { + const LSRUse &LU = *I; + for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(), + JE = LU.Formulae.end(); + J != JE; ++J) + assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, + *J) && "Illegal formula generated!"); }; #endif @@ -4743,7 +4734,7 @@ void LSRInstance::print(raw_ostream &OS) const { print_uses(OS); } -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void LSRInstance::dump() const { print(errs()); errs() << '\n'; } @@ -4752,13 +4743,9 @@ void LSRInstance::dump() const { namespace { class LoopStrengthReduce : public LoopPass { - /// TLI - Keep a pointer of a TargetLowering to consult for determining - /// transformation profitability. 
- const TargetLowering *const TLI; - public: static char ID; // Pass ID, replacement for typeid - explicit LoopStrengthReduce(const TargetLowering *tli = 0); + LoopStrengthReduce(); private: bool runOnLoop(Loop *L, LPPassManager &LPM); @@ -4770,6 +4757,7 @@ private: char LoopStrengthReduce::ID = 0; INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce", "Loop Strength Reduction", false, false) +INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) INITIALIZE_PASS_DEPENDENCY(DominatorTree) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(IVUsers) @@ -4779,14 +4767,13 @@ INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce", "Loop Strength Reduction", false, false) -Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) { - return new LoopStrengthReduce(TLI); +Pass *llvm::createLoopStrengthReducePass() { + return new LoopStrengthReduce(); } -LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli) - : LoopPass(ID), TLI(tli) { - initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry()); - } +LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) { + initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry()); +} void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { // We split critical edges, so we change the CFG. However, we do update @@ -4805,24 +4792,27 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequiredID(LoopSimplifyID); AU.addRequired<IVUsers>(); AU.addPreserved<IVUsers>(); + AU.addRequired<TargetTransformInfo>(); } bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { bool Changed = false; // Run the main LSR transformation. - Changed |= LSRInstance(TLI, L, this).getChanged(); + Changed |= LSRInstance(L, this).getChanged(); // Remove any extra phis created by processing inner loops. Changed |= DeleteDeadPHIs(L->getHeader()); - if (EnablePhiElim) { + if (EnablePhiElim && L->isLoopSimplifyForm()) { SmallVector<WeakVH, 16> DeadInsts; SCEVExpander Rewriter(getAnalysis<ScalarEvolution>(), "lsr"); #ifndef NDEBUG Rewriter.setDebugType(DEBUG_TYPE); #endif - unsigned numFolded = Rewriter. - replaceCongruentIVs(L, &getAnalysis<DominatorTree>(), DeadInsts, TLI); + unsigned numFolded = + Rewriter.replaceCongruentIVs(L, &getAnalysis<DominatorTree>(), + DeadInsts, + &getAnalysis<TargetTransformInfo>()); if (numFolded) { Changed = true; DeleteTriviallyDeadInstructions(DeadInsts); diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index 09a186f..e0f915b 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -13,16 +13,16 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "loop-unroll" -#include "llvm/IntrinsicInst.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/UnrollLoop.h" -#include "llvm/Target/TargetData.h" #include <climits> using namespace llvm; @@ -113,12 +113,13 @@ Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial) { /// ApproximateLoopSize - Approximate the size of the loop. 
static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, - const TargetData *TD) { + bool &NotDuplicatable, const DataLayout *TD) { CodeMetrics Metrics; for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) Metrics.analyzeBasicBlock(*I, TD); NumCalls = Metrics.NumInlineCandidates; + NotDuplicatable = Metrics.notDuplicatable; unsigned LoopSize = Metrics.NumInsts; @@ -145,7 +146,9 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // not user specified. unsigned Threshold = CurrentThreshold; if (!UserThreshold && - Header->getParent()->hasFnAttr(Attribute::OptimizeForSize)) + Header->getParent()->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, + Attribute::OptimizeForSize)) Threshold = OptSizeUnrollThreshold; // Find trip count and trip multiple if count is not available @@ -178,10 +181,17 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // Enforce the threshold. if (Threshold != NoThreshold) { - const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); unsigned NumInlineCandidates; - unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates, TD); + bool notDuplicatable; + unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates, + notDuplicatable, TD); DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); + if (notDuplicatable) { + DEBUG(dbgs() << " Not unrolling loop which contains non duplicatable" + << " instructions.\n"); + return false; + } if (NumInlineCandidates != 0) { DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); return false; diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 58f7739..68d4423 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -28,25 +28,25 @@ #define DEBUG_TYPE "loop-unswitch" #include "llvm/Transforms/Scalar.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Function.h" -#include "llvm/Instructions.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <map> #include <set> @@ -248,6 +248,13 @@ bool LUAnalysisCache::countLoop(const Loop* L) { Props.SizeEstimation = std::min(Metrics.NumInsts, Metrics.NumBlocks * 5); Props.CanBeUnswitchedCount = MaxSize / (Props.SizeEstimation); MaxSize -= Props.SizeEstimation * Props.CanBeUnswitchedCount; + + if (Metrics.notDuplicatable) { + DEBUG(dbgs() << "NOT unswitching loop %" + << L->getHeader()->getName() << ", contents cannot be " + << "duplicated!\n"); + return false; + } } 
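[Editorial note] The LoopUnroll and LoopUnswitch hunks above thread a notDuplicatable flag out of CodeMetrics and bail out before applying their size thresholds. A standalone sketch of that control flow for the unroll case (BlockMetrics and shouldUnroll are simplified stand-ins, not CodeMetrics or the pass itself):

#include <vector>

// Simplified stand-in for per-block llvm::CodeMetrics results.
struct BlockMetrics {
  unsigned NumInsts;
  unsigned NumInlineCandidates;
  bool NotDuplicatable;
};

// Sketch: sum per-block metrics and refuse to unroll if any block cannot be
// duplicated (e.g. indirectbr/blockaddress uses) or the loop has inlinable calls.
static bool shouldUnroll(const std::vector<BlockMetrics> &Blocks, unsigned Threshold) {
  unsigned LoopSize = 0, NumCalls = 0;
  bool NotDuplicatable = false;
  for (const BlockMetrics &M : Blocks) {
    LoopSize += M.NumInsts;
    NumCalls += M.NumInlineCandidates;
    NotDuplicatable |= M.NotDuplicatable;
  }
  if (NotDuplicatable)
    return false;          // duplicating the body would be illegal
  if (NumCalls != 0)
    return false;          // prefer inlining over unrolling
  return LoopSize <= Threshold;
}

int main() {
  std::vector<BlockMetrics> Blocks = {{40, 0, false}, {60, 0, false}};
  return shouldUnroll(Blocks, /*Threshold=*/150) ? 0 : 1;
}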
if (!Props.CanBeUnswitchedCount) { @@ -638,7 +645,9 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) { // Check to see if it would be profitable to unswitch current loop. // Do not do non-trivial unswitch while optimizing for size. - if (OptimizeForSize || F->hasFnAttr(Attribute::OptimizeForSize)) + if (OptimizeForSize || + F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::OptimizeForSize)) return false; UnswitchNontrivialCondition(LoopCond, Val, currentLoop); @@ -906,13 +915,9 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, /// specified. static void RemoveFromWorklist(Instruction *I, std::vector<Instruction*> &Worklist) { - std::vector<Instruction*>::iterator WI = std::find(Worklist.begin(), - Worklist.end(), I); - while (WI != Worklist.end()) { - unsigned Offset = WI-Worklist.begin(); - Worklist.erase(WI); - WI = std::find(Worklist.begin()+Offset, Worklist.end(), I); - } + + Worklist.erase(std::remove(Worklist.begin(), Worklist.end(), I), + Worklist.end()); } /// ReplaceUsesOfWith - When we find that I really equals V, remove I from the diff --git a/lib/Transforms/Scalar/LowerAtomic.cpp b/lib/Transforms/Scalar/LowerAtomic.cpp index 7419a65..8ced494 100644 --- a/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/lib/Transforms/Scalar/LowerAtomic.cpp @@ -14,9 +14,9 @@ #define DEBUG_TYPE "loweratomic" #include "llvm/Transforms/Scalar.h" -#include "llvm/Function.h" -#include "llvm/IRBuilder.h" -#include "llvm/IntrinsicInst.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Pass.h" using namespace llvm; diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 2a5ee33..be0f0e8 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -14,20 +14,20 @@ #define DEBUG_TYPE "memcpyopt" #include "llvm/Transforms/Scalar.h" -#include "llvm/GlobalVariable.h" -#include "llvm/IRBuilder.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" #include <list> @@ -38,8 +38,8 @@ STATISTIC(NumMemSetInfer, "Number of memsets inferred"); STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy"); STATISTIC(NumCpyToSet, "Number of memcpys converted to memset"); -static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx, - bool &VariableIdxFound, const TargetData &TD){ +static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx, + bool &VariableIdxFound, const DataLayout &TD){ // Skip over the first indices. gep_type_iterator GTI = gep_type_begin(GEP); for (unsigned i = 1; i != Idx; ++i, ++GTI) @@ -72,11 +72,11 @@ static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx, /// constant offset, and return that constant offset. 
For example, Ptr1 might /// be &A[42], and Ptr2 might be &A[40]. In this case offset would be -8. static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, - const TargetData &TD) { + const DataLayout &TD) { Ptr1 = Ptr1->stripPointerCasts(); Ptr2 = Ptr2->stripPointerCasts(); - GetElementPtrInst *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1); - GetElementPtrInst *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2); + GEPOperator *GEP1 = dyn_cast<GEPOperator>(Ptr1); + GEPOperator *GEP2 = dyn_cast<GEPOperator>(Ptr2); bool VariableIdxFound = false; @@ -141,12 +141,12 @@ struct MemsetRange { /// TheStores - The actual stores that make up this range. SmallVector<Instruction*, 16> TheStores; - bool isProfitableToUseMemset(const TargetData &TD) const; + bool isProfitableToUseMemset(const DataLayout &TD) const; }; } // end anon namespace -bool MemsetRange::isProfitableToUseMemset(const TargetData &TD) const { +bool MemsetRange::isProfitableToUseMemset(const DataLayout &TD) const { // If we found more than 4 stores to merge or 16 bytes, use memset. if (TheStores.size() >= 4 || End-Start >= 16) return true; @@ -192,9 +192,9 @@ class MemsetRanges { /// because each element is relatively large and expensive to copy. std::list<MemsetRange> Ranges; typedef std::list<MemsetRange>::iterator range_iterator; - const TargetData &TD; + const DataLayout &TD; public: - MemsetRanges(const TargetData &td) : TD(td) {} + MemsetRanges(const DataLayout &td) : TD(td) {} typedef std::list<MemsetRange>::const_iterator const_iterator; const_iterator begin() const { return Ranges.begin(); } @@ -302,7 +302,7 @@ namespace { class MemCpyOpt : public FunctionPass { MemoryDependenceAnalysis *MD; TargetLibraryInfo *TLI; - const TargetData *TD; + const DataLayout *TD; public: static char ID; // Pass identification, replacement for typeid MemCpyOpt() : FunctionPass(ID) { @@ -332,7 +332,7 @@ namespace { bool processMemCpy(MemCpyInst *M); bool processMemMove(MemMoveInst *M); bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc, - uint64_t cpyLen, CallInst *C); + uint64_t cpyLen, unsigned cpyAlign, CallInst *C); bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, uint64_t MSize); bool processByValArgument(CallSite CS, unsigned ArgNo); @@ -509,10 +509,18 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { } if (C) { + unsigned storeAlign = SI->getAlignment(); + if (!storeAlign) + storeAlign = TD->getABITypeAlignment(SI->getOperand(0)->getType()); + unsigned loadAlign = LI->getAlignment(); + if (!loadAlign) + loadAlign = TD->getABITypeAlignment(LI->getType()); + bool changed = performCallSlotOptzn(LI, SI->getPointerOperand()->stripPointerCasts(), LI->getPointerOperand()->stripPointerCasts(), - TD->getTypeStoreSize(SI->getOperand(0)->getType()), C); + TD->getTypeStoreSize(SI->getOperand(0)->getType()), + std::min(storeAlign, loadAlign), C); if (changed) { MD->removeInstruction(SI); SI->eraseFromParent(); @@ -559,7 +567,8 @@ bool MemCpyOpt::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) { /// the call write its result directly into the destination of the memcpy. bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, Value *cpyDest, Value *cpySrc, - uint64_t cpyLen, CallInst *C) { + uint64_t cpyLen, unsigned cpyAlign, + CallInst *C) { // The general transformation to keep in mind is // // call @func(..., src, ...) 
@@ -625,6 +634,16 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, return false; } + // Check that dest points to memory that is at least as aligned as src. + unsigned srcAlign = srcAlloca->getAlignment(); + if (!srcAlign) + srcAlign = TD->getABITypeAlignment(srcAlloca->getAllocatedType()); + bool isDestSufficientlyAligned = srcAlign <= cpyAlign; + // If dest is not aligned enough and we can't increase its alignment then + // bail out. + if (!isDestSufficientlyAligned && !isa<AllocaInst>(cpyDest)) + return false; + // Check that src is not accessed except via the call and the memcpy. This // guarantees that it holds only undefined values when passed in (so the final // memcpy can be dropped), that it is not read or written between the call and @@ -673,20 +692,26 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, bool changedArgument = false; for (unsigned i = 0; i < CS.arg_size(); ++i) if (CS.getArgument(i)->stripPointerCasts() == cpySrc) { - if (cpySrc->getType() != cpyDest->getType()) - cpyDest = CastInst::CreatePointerCast(cpyDest, cpySrc->getType(), - cpyDest->getName(), C); + Value *Dest = cpySrc->getType() == cpyDest->getType() ? cpyDest + : CastInst::CreatePointerCast(cpyDest, cpySrc->getType(), + cpyDest->getName(), C); changedArgument = true; - if (CS.getArgument(i)->getType() == cpyDest->getType()) - CS.setArgument(i, cpyDest); + if (CS.getArgument(i)->getType() == Dest->getType()) + CS.setArgument(i, Dest); else - CS.setArgument(i, CastInst::CreatePointerCast(cpyDest, - CS.getArgument(i)->getType(), cpyDest->getName(), C)); + CS.setArgument(i, CastInst::CreatePointerCast(Dest, + CS.getArgument(i)->getType(), Dest->getName(), C)); } if (!changedArgument) return false; + // If the destination wasn't sufficiently aligned then increase its alignment. + if (!isDestSufficientlyAligned) { + assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!"); + cast<AllocaInst>(cpyDest)->setAlignment(srcAlign); + } + // Drop any cached information about the call, because we may have changed // its dependence information by changing its parameter. MD->removeInstruction(C); @@ -813,7 +838,8 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { if (DepInfo.isClobber()) { if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) { if (performCallSlotOptzn(M, M->getDest(), M->getSource(), - CopySize->getZExtValue(), C)) { + CopySize->getZExtValue(), M->getAlignment(), + C)) { MD->removeInstruction(M); M->eraseFromParent(); return true; @@ -974,7 +1000,7 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { bool MemCpyOpt::runOnFunction(Function &F) { bool MadeChange = false; MD = &getAnalysis<MemoryDependenceAnalysis>(); - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); TLI = &getAnalysis<TargetLibraryInfo>(); // If we don't have at least memset and memcpy, there is little point of doing diff --git a/lib/Transforms/Scalar/ObjCARC.cpp b/lib/Transforms/Scalar/ObjCARC.cpp index dce8e8b..e6ec841 100644 --- a/lib/Transforms/Scalar/ObjCARC.cpp +++ b/lib/Transforms/Scalar/ObjCARC.cpp @@ -29,8 +29,10 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "objc-arc" -#include "llvm/Support/CommandLine.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; // A handy option to enable/disable all optimizations in this file. @@ -131,12 +133,12 @@ namespace { // ARC Utilities. 
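[Editorial note] The performCallSlotOptzn hunks in the MemCpyOptimizer diff above add an alignment requirement: the call's destination must be at least as aligned as the source alloca, or be an alloca whose alignment can be raised to match. A standalone sketch of that decision (types and names are illustrative, not the pass's API):

#include <cstdint>

// Sketch of the call-slot alignment rule: if the destination is not aligned
// at least as strictly as the source buffer, the transform is only safe when
// the destination is an alloca whose alignment we are free to increase.
static bool destAlignmentOK(unsigned SrcAlign, unsigned CpyAlign,
                            bool DestIsAlloca, unsigned &NewDestAlign) {
  bool SufficientlyAligned = SrcAlign <= CpyAlign;
  if (SufficientlyAligned)
    return true;
  if (!DestIsAlloca)
    return false;           // cannot change the alignment of arbitrary memory
  NewDestAlign = SrcAlign;  // raise the alloca's alignment instead
  return true;
}

int main() {
  unsigned NewAlign = 0;
  // Destination is an under-aligned alloca: allowed, but must be re-aligned to 16.
  bool OK = destAlignmentOK(/*SrcAlign=*/16, /*CpyAlign=*/8, /*DestIsAlloca=*/true, NewAlign);
  return (OK && NewAlign == 16) ? 0 : 1;
}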
//===----------------------------------------------------------------------===// -#include "llvm/Intrinsics.h" -#include "llvm/Module.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Transforms/Utils/Local.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" #include "llvm/Support/CallSite.h" -#include "llvm/ADT/StringSwitch.h" +#include "llvm/Transforms/Utils/Local.h" namespace { /// InstructionClass - A simple classification for instructions. @@ -659,9 +661,9 @@ static bool DoesObjCBlockEscape(const Value *BlockPtr) { // ARC AliasAnalysis. //===----------------------------------------------------------------------===// -#include "llvm/Pass.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/Passes.h" +#include "llvm/Pass.h" namespace { /// ObjCARCAliasAnalysis - This is a simple alias analysis @@ -885,25 +887,33 @@ bool ObjCARCExpand::runOnFunction(Function &F) { for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) { Instruction *Inst = &*I; + DEBUG(dbgs() << "ObjCARCExpand: Visiting: " << *Inst << "\n"); + switch (GetBasicInstructionClass(Inst)) { case IC_Retain: case IC_RetainRV: case IC_Autorelease: case IC_AutoreleaseRV: case IC_FusedRetainAutorelease: - case IC_FusedRetainAutoreleaseRV: + case IC_FusedRetainAutoreleaseRV: { // These calls return their argument verbatim, as a low-level // optimization. However, this makes high-level optimizations // harder. Undo any uses of this optimization that the front-end // emitted here. We'll redo them in the contract pass. Changed = true; - Inst->replaceAllUsesWith(cast<CallInst>(Inst)->getArgOperand(0)); + Value *Value = cast<CallInst>(Inst)->getArgOperand(0); + DEBUG(dbgs() << "ObjCARCExpand: Old = " << *Inst << "\n" + " New = " << *Value << "\n"); + Inst->replaceAllUsesWith(Value); break; + } default: break; } } + DEBUG(dbgs() << "ObjCARCExpand: Finished List.\n\n"); + return Changed; } @@ -911,8 +921,8 @@ bool ObjCARCExpand::runOnFunction(Function &F) { // ARC autorelease pool elimination. //===----------------------------------------------------------------------===// -#include "llvm/Constants.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/IR/Constants.h" namespace { /// ObjCARCAPElim - Autorelease pool elimination. @@ -985,6 +995,9 @@ bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) { // zap the pair. if (Push && cast<CallInst>(Inst)->getArgOperand(0) == Push) { Changed = true; + DEBUG(dbgs() << "ObjCARCAPElim::OptimizeBB: Zapping push pop autorelease pair:\n" + << " Pop: " << *Inst << "\n" + << " Push: " << *Push << "\n"); Inst->eraseFromParent(); Push->eraseFromParent(); } @@ -1092,10 +1105,10 @@ bool ObjCARCAPElim::runOnModule(Module &M) { // TODO: Delete release+retain pairs (rare). -#include "llvm/LLVMContext.h" -#include "llvm/Support/CFG.h" -#include "llvm/ADT/Statistic.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/CFG.h" STATISTIC(NumNoops, "Number of no-op objc calls eliminated"); STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated"); @@ -1120,9 +1133,8 @@ namespace { bool relatedSelect(const SelectInst *A, const Value *B); bool relatedPHI(const PHINode *A, const Value *B); - // Do not implement. 
- void operator=(const ProvenanceAnalysis &); - ProvenanceAnalysis(const ProvenanceAnalysis &); + void operator=(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION; + ProvenanceAnalysis(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION; public: ProvenanceAnalysis() {} @@ -1597,6 +1609,12 @@ void BBState::MergePred(const BBState &Other) { // loop backedge. Loop backedges are special. TopDownPathCount += Other.TopDownPathCount; + // Check for overflow. If we have overflow, fall back to conservative behavior. + if (TopDownPathCount < Other.TopDownPathCount) { + clearTopDownPointers(); + return; + } + // For each entry in the other set, if our set has an entry with the same key, // merge the entries. Otherwise, copy the entry and merge it with an empty // entry. @@ -1622,6 +1640,12 @@ void BBState::MergeSucc(const BBState &Other) { // loop backedge. Loop backedges are special. BottomUpPathCount += Other.BottomUpPathCount; + // Check for overflow. If we have overflow, fall back to conservative behavior. + if (BottomUpPathCount < Other.BottomUpPathCount) { + clearBottomUpPointers(); + return; + } + // For each entry in the other set, if our set has an entry with the // same key, merge the entries. Otherwise, copy the entry and merge // it with an empty entry. @@ -1776,10 +1800,12 @@ Constant *ObjCARCOpt::getRetainRVCallee(Module *M) { Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); Type *Params[] = { I8X }; FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + AttributeSet Attribute = + AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex, + Attribute::get(C, Attribute::NoUnwind)); RetainRVCallee = M->getOrInsertFunction("objc_retainAutoreleasedReturnValue", FTy, - Attributes); + Attribute); } return RetainRVCallee; } @@ -1790,10 +1816,12 @@ Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) { Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); Type *Params[] = { I8X }; FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + AttributeSet Attribute = + AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex, + Attribute::get(C, Attribute::NoUnwind)); AutoreleaseRVCallee = M->getOrInsertFunction("objc_autoreleaseReturnValue", FTy, - Attributes); + Attribute); } return AutoreleaseRVCallee; } @@ -1802,12 +1830,14 @@ Constant *ObjCARCOpt::getReleaseCallee(Module *M) { if (!ReleaseCallee) { LLVMContext &C = M->getContext(); Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; - AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + AttributeSet Attribute = + AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex, + Attribute::get(C, Attribute::NoUnwind)); ReleaseCallee = M->getOrInsertFunction( "objc_release", FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false), - Attributes); + Attribute); } return ReleaseCallee; } @@ -1816,12 +1846,14 @@ Constant *ObjCARCOpt::getRetainCallee(Module *M) { if (!RetainCallee) { LLVMContext &C = M->getContext(); Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; - AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + AttributeSet Attribute = + AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex, + Attribute::get(C, Attribute::NoUnwind)); RetainCallee = M->getOrInsertFunction( "objc_retain", FunctionType::get(Params[0], Params, 
/*isVarArg=*/false), - Attributes); + Attribute); } return RetainCallee; } @@ -1836,7 +1868,7 @@ Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) { M->getOrInsertFunction( "objc_retainBlock", FunctionType::get(Params[0], Params, /*isVarArg=*/false), - AttrListPtr()); + AttributeSet()); } return RetainBlockCallee; } @@ -1845,12 +1877,14 @@ Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) { if (!AutoreleaseCallee) { LLVMContext &C = M->getContext(); Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; - AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + AttributeSet Attribute = + AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex, + Attribute::get(C, Attribute::NoUnwind)); AutoreleaseCallee = M->getOrInsertFunction( "objc_autorelease", FunctionType::get(Params[0], Params, /*isVarArg=*/false), - Attributes); + Attribute); } return AutoreleaseCallee; } @@ -2165,7 +2199,17 @@ ObjCARCOpt::OptimizeRetainCall(Function &F, Instruction *Retain) { // Turn it to an objc_retainAutoreleasedReturnValue.. Changed = true; ++NumPeeps; + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainCall: Transforming " + "objc_retainAutoreleasedReturnValue => " + "objc_retain since the operand is not a return value.\n" + " Old: " + << *Retain << "\n"); + cast<CallInst>(Retain)->setCalledFunction(getRetainRVCallee(F.getParent())); + + DEBUG(dbgs() << " New: " + << *Retain << "\n"); } /// OptimizeRetainRVCall - Turn objc_retainAutoreleasedReturnValue into @@ -2203,6 +2247,11 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { GetObjCArg(I) == Arg) { Changed = true; ++NumPeeps; + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainRVCall: Erasing " << *I << "\n" + << " Erasing " << *RetainRV + << "\n"); + EraseInstruction(I); EraseInstruction(RetainRV); return true; @@ -2212,7 +2261,18 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { // Turn it to a plain objc_retain. Changed = true; ++NumPeeps; + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainRVCall: Transforming " + "objc_retainAutoreleasedReturnValue => " + "objc_retain since the operand is not a return value.\n" + " Old: " + << *RetainRV << "\n"); + cast<CallInst>(RetainRV)->setCalledFunction(getRetainCallee(F.getParent())); + + DEBUG(dbgs() << " New: " + << *RetainRV << "\n"); + return false; } @@ -2238,8 +2298,20 @@ ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV) { Changed = true; ++NumPeeps; + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeAutoreleaseRVCall: Transforming " + "objc_autoreleaseReturnValue => " + "objc_autorelease since its operand is not used as a return " + "value.\n" + " Old: " + << *AutoreleaseRV << "\n"); + cast<CallInst>(AutoreleaseRV)-> setCalledFunction(getAutoreleaseCallee(F.getParent())); + + DEBUG(dbgs() << " New: " + << *AutoreleaseRV << "\n"); + } /// OptimizeIndividualCalls - Visit each call, one at a time, and make @@ -2251,6 +2323,10 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { // Visit all objc_* calls in F. 
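The overflow guards added to BBState::MergePred and BBState::MergeSucc above lean on a simple property of unsigned arithmetic: after an unsigned a += b, the stored sum is smaller than b exactly when the mathematical sum did not fit, so comparing against the addend is a complete wrap-around test. A minimal standalone sketch of that check (plain C++, not LLVM code; every name here is invented for illustration):

#include <cassert>
#include <cstdint>
#include <limits>

struct ToyBBState {
  uint64_t PathCount;
  bool Conservative;          // stands in for clearTopDownPointers()
  ToyBBState() : PathCount(0), Conservative(false) {}

  void mergePathCount(uint64_t Other) {
    PathCount += Other;
    // Unsigned addition wraps modulo 2^64, so after "a += b" the result is
    // smaller than b precisely when the true sum overflowed 64 bits.
    if (PathCount < Other)
      Conservative = true;    // overflow: give up on precise path counts
  }
};

int main() {
  ToyBBState S;
  S.mergePathCount(std::numeric_limits<uint64_t>::max());
  assert(!S.Conservative);    // no overflow yet
  S.mergePathCount(2);        // wraps around; the guard must fire
  assert(S.Conservative);
  return 0;
}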
for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { Instruction *Inst = &*I++; + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Visiting: " << + *Inst << "\n"); + InstructionClass Class = GetBasicInstructionClass(Inst); switch (Class) { @@ -2267,6 +2343,8 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { case IC_NoopCast: Changed = true; ++NumNoops; + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Erasing no-op cast:" + " " << *Inst << "\n"); EraseInstruction(Inst); continue; @@ -2283,7 +2361,13 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), Constant::getNullValue(Ty), CI); - CI->replaceAllUsesWith(UndefValue::get(CI->getType())); + llvm::Value *NewValue = UndefValue::get(CI->getType()); + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: A null " + "pointer-to-weak-pointer is undefined behavior.\n" + " Old = " << *CI << + "\n New = " << + *NewValue << "\n"); + CI->replaceAllUsesWith(NewValue); CI->eraseFromParent(); continue; } @@ -2299,7 +2383,15 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), Constant::getNullValue(Ty), CI); - CI->replaceAllUsesWith(UndefValue::get(CI->getType())); + + llvm::Value *NewValue = UndefValue::get(CI->getType()); + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: A null " + "pointer-to-weak-pointer is undefined behavior.\n" + " Old = " << *CI << + "\n New = " << + *NewValue << "\n"); + + CI->replaceAllUsesWith(NewValue); CI->eraseFromParent(); continue; } @@ -2333,6 +2425,14 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { Call->getArgOperand(0), "", Call); NewCall->setMetadata(ImpreciseReleaseMDKind, MDNode::get(C, ArrayRef<Value *>())); + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Replacing " + "objc_autorelease(x) with objc_release(x) since x is " + "otherwise unused.\n" + " Old: " << *Call << + "\n New: " << + *NewCall << "\n"); + EraseInstruction(Call); Inst = NewCall; Class = IC_Release; @@ -2343,12 +2443,17 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { // a tail keyword. if (IsAlwaysTail(Class)) { Changed = true; + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Adding tail keyword" + " to function since it can never be passed stack args: " << *Inst << + "\n"); cast<CallInst>(Inst)->setTailCall(); } // Set nounwind as needed. if (IsNoThrow(Class)) { Changed = true; + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Found no throw" + " class. Setting nounwind on: " << *Inst << "\n"); cast<CallInst>(Inst)->setDoesNotThrow(); } @@ -2363,6 +2468,8 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { if (isNullOrUndef(Arg)) { Changed = true; ++NumNoops; + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: ARC calls with " + " null are no-ops. Erasing: " << *Inst << "\n"); EraseInstruction(Inst); continue; } @@ -2464,6 +2571,9 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { } } } while (!Worklist.empty()); + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Finished Queue.\n\n"); + } } @@ -3367,6 +3477,10 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { // queries instead. 
for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { Instruction *Inst = &*I++; + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeWeakCalls: Visiting: " << *Inst << + "\n"); + InstructionClass Class = GetBasicInstructionClass(Inst); if (Class != IC_LoadWeak && Class != IC_LoadWeakRetained) continue; @@ -3512,6 +3626,9 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { done:; } } + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeWeakCalls: Finished List.\n\n"); + } /// OptimizeSequences - Identify program paths which execute sequences of @@ -3537,19 +3654,19 @@ bool ObjCARCOpt::OptimizeSequences(Function &F) { } /// OptimizeReturns - Look for this pattern: -/// +/// \code /// %call = call i8* @something(...) /// %2 = call i8* @objc_retain(i8* %call) /// %3 = call i8* @objc_autorelease(i8* %2) /// ret i8* %3 -/// +/// \endcode /// And delete the retain and autorelease. /// /// Otherwise if it's just this: -/// +/// \code /// %3 = call i8* @objc_autorelease(i8* %2) /// ret i8* %3 -/// +/// \endcode /// convert the autorelease to autoreleaseRV. void ObjCARCOpt::OptimizeReturns(Function &F) { if (!F.getReturnType()->isPointerTy()) @@ -3560,6 +3677,9 @@ void ObjCARCOpt::OptimizeReturns(Function &F) { for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { BasicBlock *BB = FI; ReturnInst *Ret = dyn_cast<ReturnInst>(&BB->back()); + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Visiting: " << *Ret << "\n"); + if (!Ret) continue; const Value *Arg = StripPointerCastsAndObjCCalls(Ret->getOperand(0)); @@ -3633,6 +3753,9 @@ void ObjCARCOpt::OptimizeReturns(Function &F) { // If so, we can zap the retain and autorelease. Changed = true; ++NumRets; + DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Erasing: " << *Retain + << "\n Erasing: " + << *Autorelease << "\n"); EraseInstruction(Retain); EraseInstruction(Autorelease); } @@ -3643,6 +3766,9 @@ void ObjCARCOpt::OptimizeReturns(Function &F) { DependingInstructions.clear(); Visited.clear(); } + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Finished List.\n\n"); + } bool ObjCARCOpt::doInitialization(Module &M) { @@ -3734,9 +3860,9 @@ void ObjCARCOpt::releaseMemory() { // TODO: ObjCARCContract could insert PHI nodes when uses aren't // dominated by single calls. 
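The callee getters in these ObjCARC hunks all migrate from AttrListPtr to the transitional AttributeSet interface, and getStoreStrongCallee in the ObjCARCContract hunk just below is the fullest case: it attaches a function-level nounwind plus a nocapture on parameter 1. A condensed restatement of that pattern, using only calls that appear verbatim in the hunks; the helper name is invented, and the headers and using-directive of the surrounding file are omitted because header locations were in flux at this revision:

// Declare a void(i8**, i8*) runtime function carrying nounwind on the
// function itself and nocapture on its first parameter, in the style of
// getStoreStrongCallee below. 'declareStoreStrongLike' is an illustrative
// name, not part of the patch.
static Constant *declareStoreStrongLike(Module *M, StringRef Name) {
  LLVMContext &C = M->getContext();
  Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
  Type *I8XX = PointerType::getUnqual(I8X);
  Type *Params[] = { I8XX, I8X };

  // Old style: AttrListPtr().addAttr(~0u, ...).addAttr(1, ...)
  // New style: explicit context plus AttributeSet::FunctionIndex.
  AttributeSet Attrs = AttributeSet()
    .addAttr(C, AttributeSet::FunctionIndex,
             Attribute::get(C, Attribute::NoUnwind))
    .addAttr(C, 1, Attribute::get(C, Attribute::NoCapture));

  return M->getOrInsertFunction(
      Name,
      FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false),
      Attrs);
}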
-#include "llvm/Operator.h" -#include "llvm/InlineAsm.h" #include "llvm/Analysis/Dominators.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Operator.h" STATISTIC(NumStoreStrongs, "Number objc_storeStrong calls formed"); @@ -3818,15 +3944,16 @@ Constant *ObjCARCContract::getStoreStrongCallee(Module *M) { Type *I8XX = PointerType::getUnqual(I8X); Type *Params[] = { I8XX, I8X }; - AttrListPtr Attributes = AttrListPtr() - .addAttr(~0u, Attribute::NoUnwind) - .addAttr(1, Attribute::NoCapture); + AttributeSet Attribute = AttributeSet() + .addAttr(M->getContext(), AttributeSet::FunctionIndex, + Attribute::get(C, Attribute::NoUnwind)) + .addAttr(M->getContext(), 1, Attribute::get(C, Attribute::NoCapture)); StoreStrongCallee = M->getOrInsertFunction( "objc_storeStrong", FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false), - Attributes); + Attribute); } return StoreStrongCallee; } @@ -3837,9 +3964,11 @@ Constant *ObjCARCContract::getRetainAutoreleaseCallee(Module *M) { Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); Type *Params[] = { I8X }; FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + AttributeSet Attribute = + AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex, + Attribute::get(C, Attribute::NoUnwind)); RetainAutoreleaseCallee = - M->getOrInsertFunction("objc_retainAutorelease", FTy, Attributes); + M->getOrInsertFunction("objc_retainAutorelease", FTy, Attribute); } return RetainAutoreleaseCallee; } @@ -3850,10 +3979,12 @@ Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) { Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); Type *Params[] = { I8X }; FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + AttributeSet Attribute = + AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex, + Attribute::get(C, Attribute::NoUnwind)); RetainAutoreleaseRVCallee = M->getOrInsertFunction("objc_retainAutoreleaseReturnValue", FTy, - Attributes); + Attribute); } return RetainAutoreleaseRVCallee; } @@ -3897,11 +4028,19 @@ ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease, Changed = true; ++NumPeeps; + DEBUG(dbgs() << "ObjCARCContract::ContractAutorelease: Fusing " + "retain/autorelease. Erasing: " << *Autorelease << "\n" + " Old Retain: " + << *Retain << "\n"); + if (Class == IC_AutoreleaseRV) Retain->setCalledFunction(getRetainAutoreleaseRVCallee(F.getParent())); else Retain->setCalledFunction(getRetainAutoreleaseCallee(F.getParent())); + DEBUG(dbgs() << " New Retain: " + << *Retain << "\n"); + EraseInstruction(Autorelease); return true; } @@ -4052,6 +4191,8 @@ bool ObjCARCContract::runOnFunction(Function &F) { for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { Instruction *Inst = &*I++; + DEBUG(dbgs() << "ObjCARCContract: Visiting: " << *Inst << "\n"); + // Only these library routines return their argument. In particular, // objc_retainBlock does not necessarily return its argument. 
InstructionClass Class = GetBasicInstructionClass(Inst); @@ -4089,6 +4230,8 @@ bool ObjCARCContract::runOnFunction(Function &F) { } while (isNoopInstruction(BBI)); if (&*BBI == GetObjCArg(Inst)) { + DEBUG(dbgs() << "ObjCARCContract: Adding inline asm marker for " + "retainAutoreleasedReturnValue optimization.\n"); Changed = true; InlineAsm *IA = InlineAsm::get(FunctionType::get(Type::getVoidTy(Inst->getContext()), @@ -4108,6 +4251,10 @@ bool ObjCARCContract::runOnFunction(Function &F) { ConstantPointerNull::get(cast<PointerType>(CI->getType())); Changed = true; new StoreInst(Null, CI->getArgOperand(0), CI); + + DEBUG(dbgs() << "OBJCARCContract: Old = " << *CI << "\n" + << " New = " << *Null << "\n"); + CI->replaceAllUsesWith(Null); CI->eraseFromParent(); } @@ -4127,6 +4274,8 @@ bool ObjCARCContract::runOnFunction(Function &F) { continue; } + DEBUG(dbgs() << "ObjCARCContract: Finished List.\n\n"); + // Don't use GetObjCArg because we don't want to look through bitcasts // and such; to do the replacement, the argument must have type i8*. const Value *Arg = cast<CallInst>(Inst)->getArgOperand(0); diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index 09687d8..0da3746 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -22,24 +22,24 @@ #define DEBUG_TYPE "reassociate" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Function.h" -#include "llvm/IRBuilder.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Pass.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Assembly/Writer.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Pass.h" #include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> using namespace llvm; @@ -339,36 +339,6 @@ static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) { } } -/// EvaluateRepeatedConstant - Compute C op C op ... op C where the constant C -/// is repeated Weight times. -static Constant *EvaluateRepeatedConstant(unsigned Opcode, Constant *C, - APInt Weight) { - // For addition the result can be efficiently computed as the product of the - // constant and the weight. - if (Opcode == Instruction::Add) - return ConstantExpr::getMul(C, ConstantInt::get(C->getContext(), Weight)); - - // The weight might be huge, so compute by repeated squaring to ensure that - // compile time is proportional to the logarithm of the weight. - Constant *Result = 0; - Constant *Power = C; // Successively C, C op C, (C op C) op (C op C) etc. - // Visit the bits in Weight. - while (Weight != 0) { - // If the current bit in Weight is non-zero do Result = Result op Power. - if (Weight[0]) - Result = Result ? ConstantExpr::get(Opcode, Result, Power) : Power; - // Move on to the next bit if any more are non-zero. - Weight = Weight.lshr(1); - if (Weight.isMinValue()) - break; - // Square the power. 
- Power = ConstantExpr::get(Opcode, Power, Power); - } - - assert(Result && "Only positive weights supported!"); - return Result; -} - typedef std::pair<Value*, APInt> RepeatedValue; /// LinearizeExprTree - Given an associative binary expression, return the leaf @@ -382,9 +352,7 @@ typedef std::pair<Value*, APInt> RepeatedValue; /// op /// (Ops[N].first op Ops[N].first op ... Ops[N].first) <- Ops[N].second times /// -/// Note that the values Ops[0].first, ..., Ops[N].first are all distinct, and -/// they are all non-constant except possibly for the last one, which if it is -/// constant will have weight one (Ops[N].second === 1). +/// Note that the values Ops[0].first, ..., Ops[N].first are all distinct. /// /// This routine may modify the function, in which case it returns 'true'. The /// changes it makes may well be destructive, changing the value computed by 'I' @@ -455,10 +423,6 @@ static bool LinearizeExprTree(BinaryOperator *I, assert(Instruction::isAssociative(Opcode) && Instruction::isCommutative(Opcode) && "Expected an associative and commutative operation!"); - // If we see an absorbing element then the entire expression must be equal to - // it. For example, if this is a multiplication expression and zero occurs as - // an operand somewhere in it then the result of the expression must be zero. - Constant *Absorber = ConstantExpr::getBinOpAbsorber(Opcode, I->getType()); // Visit all operands of the expression, keeping track of their weight (the // number of paths from the expression root to the operand, or if you like @@ -506,13 +470,6 @@ static bool LinearizeExprTree(BinaryOperator *I, DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n"); assert(!Op->use_empty() && "No uses, so how did we get to it?!"); - // If the expression contains an absorbing element then there is no need - // to analyze it further: it must evaluate to the absorbing element. - if (Op == Absorber && !Weight.isMinValue()) { - Ops.push_back(std::make_pair(Absorber, APInt(Bitwidth, 1))); - return MadeChange; - } - // If this is a binary operation of the right kind with only one use then // add its operands to the expression. if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) { @@ -604,7 +561,6 @@ static bool LinearizeExprTree(BinaryOperator *I, // The leaves, repeated according to their weights, represent the linearized // form of the expression. - Constant *Cst = 0; // Accumulate constants here. for (unsigned i = 0, e = LeafOrder.size(); i != e; ++i) { Value *V = LeafOrder[i]; LeafMap::iterator It = Leaves.find(V); @@ -618,31 +574,14 @@ static bool LinearizeExprTree(BinaryOperator *I, continue; // Ensure the leaf is only output once. It->second = 0; - // Glob all constants together into Cst. - if (Constant *C = dyn_cast<Constant>(V)) { - C = EvaluateRepeatedConstant(Opcode, C, Weight); - Cst = Cst ? ConstantExpr::get(Opcode, Cst, C) : C; - continue; - } - // Add non-constant Ops.push_back(std::make_pair(V, Weight)); } - // Add any constants back into Ops, all globbed together and reduced to having - // weight 1 for the convenience of users. - Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType()); - if (Cst && Cst != Identity) { - // If combining multiple constants resulted in the absorber then the entire - // expression must evaluate to the absorber. 
- if (Cst == Absorber) - Ops.clear(); - Ops.push_back(std::make_pair(Cst, APInt(Bitwidth, 1))); - } - // For nilpotent operations or addition there may be no operands, for example // because the expression was "X xor X" or consisted of 2^Bitwidth additions: // in both cases the weight reduces to 0 causing the value to be skipped. if (Ops.empty()) { + Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType()); assert(Identity && "Associative operation without identity!"); Ops.push_back(std::make_pair(Identity, APInt(Bitwidth, 1))); } @@ -656,8 +595,8 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops) { assert(Ops.size() > 1 && "Single values should be used directly!"); - // Since our optimizations never increase the number of operations, the new - // expression can always be written by reusing the existing binary operators + // Since our optimizations should never increase the number of operations, the + // new expression can usually be written reusing the existing binary operators // from the original expression tree, without creating any new instructions, // though the rewritten expression may have a completely different topology. // We take care to not change anything if the new expression will be the same @@ -671,6 +610,20 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, unsigned Opcode = I->getOpcode(); BinaryOperator *Op = I; + /// NotRewritable - The operands being written will be the leaves of the new + /// expression and must not be used as inner nodes (via NodesToRewrite) by + /// mistake. Inner nodes are always reassociable, and usually leaves are not + /// (if they were they would have been incorporated into the expression and so + /// would not be leaves), so most of the time there is no danger of this. But + /// in rare cases a leaf may become reassociable if an optimization kills uses + /// of it, or it may momentarily become reassociable during rewriting (below) + /// due it being removed as an operand of one of its uses. Ensure that misuse + /// of leaf nodes as inner nodes cannot occur by remembering all of the future + /// leaves and refusing to reuse any of them as inner nodes. + SmallPtrSet<Value*, 8> NotRewritable; + for (unsigned i = 0, e = Ops.size(); i != e; ++i) + NotRewritable.insert(Ops[i].Op); + // ExpressionChanged - Non-null if the rewritten expression differs from the // original in some non-trivial way, requiring the clearing of optional flags. // Flags are cleared from the operator in ExpressionChanged up to I inclusive. @@ -703,12 +656,14 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, // the old operands with the new ones. DEBUG(dbgs() << "RA: " << *Op << '\n'); if (NewLHS != OldLHS) { - if (BinaryOperator *BO = isReassociableOp(OldLHS, Opcode)) + BinaryOperator *BO = isReassociableOp(OldLHS, Opcode); + if (BO && !NotRewritable.count(BO)) NodesToRewrite.push_back(BO); Op->setOperand(0, NewLHS); } if (NewRHS != OldRHS) { - if (BinaryOperator *BO = isReassociableOp(OldRHS, Opcode)) + BinaryOperator *BO = isReassociableOp(OldRHS, Opcode); + if (BO && !NotRewritable.count(BO)) NodesToRewrite.push_back(BO); Op->setOperand(1, NewRHS); } @@ -732,7 +687,8 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, Op->swapOperands(); } else { // Overwrite with the new right-hand side. 
- if (BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode)) + BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode); + if (BO && !NotRewritable.count(BO)) NodesToRewrite.push_back(BO); Op->setOperand(1, NewRHS); ExpressionChanged = Op; @@ -745,7 +701,8 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, // Now deal with the left-hand side. If this is already an operation node // from the original expression then just rewrite the rest of the expression // into it. - if (BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode)) { + BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode); + if (BO && !NotRewritable.count(BO)) { Op = BO; continue; } @@ -1446,9 +1403,26 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops) { // Now that we have the linearized expression tree, try to optimize it. // Start by folding any constants that we found. - if (Ops.size() == 1) return Ops[0].Op; - + Constant *Cst = 0; unsigned Opcode = I->getOpcode(); + while (!Ops.empty() && isa<Constant>(Ops.back().Op)) { + Constant *C = cast<Constant>(Ops.pop_back_val().Op); + Cst = Cst ? ConstantExpr::get(Opcode, C, Cst) : C; + } + // If there was nothing but constants then we are done. + if (Ops.empty()) + return Cst; + + // Put the combined constant back at the end of the operand list, except if + // there is no point. For example, an add of 0 gets dropped here, while a + // multiplication by zero turns the whole expression into zero. + if (Cst && Cst != ConstantExpr::getBinOpIdentity(Opcode, I->getType())) { + if (Cst == ConstantExpr::getBinOpAbsorber(Opcode, I->getType())) + return Cst; + Ops.push_back(ValueEntry(0, Cst)); + } + + if (Ops.size() == 1) return Ops[0].Op; // Handle destructive annihilation due to identities between elements in the // argument list here. 
diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp index ea1de63..07f540a 100644 --- a/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/lib/Transforms/Scalar/Reg2Mem.cpp @@ -18,15 +18,15 @@ #define DEBUG_TYPE "reg2mem" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Pass.h" -#include "llvm/Function.h" -#include "llvm/LLVMContext.h" -#include "llvm/Module.h" -#include "llvm/BasicBlock.h" -#include "llvm/Instructions.h" #include "llvm/ADT/Statistic.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" #include "llvm/Support/CFG.h" +#include "llvm/Transforms/Utils/Local.h" #include <list> using namespace llvm; diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 2c39aab..3e935d8 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -19,26 +19,26 @@ #define DEBUG_TYPE "sccp" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/IPO.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Instructions.h" -#include "llvm/Pass.h" -#include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Support/CallSite.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/InstVisitor.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/InstVisitor.h" +#include "llvm/Pass.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> using namespace llvm; @@ -153,7 +153,7 @@ namespace { /// Constant Propagation. /// class SCCPSolver : public InstVisitor<SCCPSolver> { - const TargetData *TD; + const DataLayout *TD; const TargetLibraryInfo *TLI; SmallPtrSet<BasicBlock*, 8> BBExecutable; // The BBs that are executable. DenseMap<Value*, LatticeVal> ValueState; // The state each value is in. 
@@ -205,7 +205,7 @@ class SCCPSolver : public InstVisitor<SCCPSolver> { typedef std::pair<BasicBlock*, BasicBlock*> Edge; DenseSet<Edge> KnownFeasibleEdges; public: - SCCPSolver(const TargetData *td, const TargetLibraryInfo *tli) + SCCPSolver(const DataLayout *td, const TargetLibraryInfo *tli) : TD(td), TLI(tli) {} /// MarkBlockExecutable - This method can be used by clients to mark all of @@ -1564,7 +1564,7 @@ static void DeleteInstructionInBlock(BasicBlock *BB) { // bool SCCP::runOnFunction(Function &F) { DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n"); - const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); SCCPSolver Solver(TD, TLI); @@ -1693,7 +1693,7 @@ static bool AddressIsTaken(const GlobalValue *GV) { } bool IPSCCP::runOnModule(Module &M) { - const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); SCCPSolver Solver(TD, TLI); diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp new file mode 100644 index 0000000..4204171 --- /dev/null +++ b/lib/Transforms/Scalar/SROA.cpp @@ -0,0 +1,3711 @@ +//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This transformation implements the well known scalar replacement of +/// aggregates transformation. It tries to identify promotable elements of an +/// aggregate alloca, and promote them to registers. It will also try to +/// convert uses of an element (or set of elements) of an alloca into a vector +/// or bitfield-style integer scalar if appropriate. +/// +/// It works to do this with minimal slicing of the alloca so that regions +/// which are merely transferred in and out of external memory remain unchanged +/// and are not decomposed to scalar code. +/// +/// Because this also performs alloca promotion, it can be thought of as also +/// serving the purpose of SSA formation. The algorithm iterates on the +/// function until all opportunities for promotion have been realized. 
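For a concrete picture of what the pass introduced in SROA.cpp below is aiming at, consider a function that builds up a small aggregate in a local variable. This is ordinary C++ rather than LLVM code, the names are invented, and it only illustrates the input pattern the file header comment describes:

// At -O0 the local 'p' becomes a single aggregate alloca. A scalar
// replacement pass of the kind described in the header splits that alloca
// into its element pieces (p.x and p.y) and promotes each piece to an SSA
// value, so no stack slot or load/store traffic for 'p' survives.
struct Point {
  double x, y;
};

double manhattan(double a, double b) {
  Point p;                       // whole-aggregate local
  p.x = a < 0 ? -a : a;
  p.y = b < 0 ? -b : b;
  return p.x + p.y;              // reads of the promoted elements
}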
+/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "sroa" +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/PtrUseVisitor.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/DIBuilder.h" +#include "llvm/DebugInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/InstVisitor.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/GetElementPtrTypeIterator.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" +using namespace llvm; + +STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement"); +STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced"); +STATISTIC(NumPromoted, "Number of allocas promoted to SSA values"); +STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion"); +STATISTIC(NumDeleted, "Number of instructions deleted"); +STATISTIC(NumVectorized, "Number of vectorized aggregates"); + +/// Hidden option to force the pass to not use DomTree and mem2reg, instead +/// forming SSA values through the SSAUpdater infrastructure. +static cl::opt<bool> +ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden); + +namespace { +/// \brief Alloca partitioning representation. +/// +/// This class represents a partitioning of an alloca into slices, and +/// information about the nature of uses of each slice of the alloca. The goal +/// is that this information is sufficient to decide if and how to split the +/// alloca apart and replace slices with scalars. It is also intended that this +/// structure can capture the relevant information needed both to decide about +/// and to enact these transformations. +class AllocaPartitioning { +public: + /// \brief A common base class for representing a half-open byte range. + struct ByteRange { + /// \brief The beginning offset of the range. + uint64_t BeginOffset; + + /// \brief The ending offset, not included in the range. + uint64_t EndOffset; + + ByteRange() : BeginOffset(), EndOffset() {} + ByteRange(uint64_t BeginOffset, uint64_t EndOffset) + : BeginOffset(BeginOffset), EndOffset(EndOffset) {} + + /// \brief Support for ordering ranges. + /// + /// This provides an ordering over ranges such that start offsets are + /// always increasing, and within equal start offsets, the end offsets are + /// decreasing. Thus the spanning range comes first in a cluster with the + /// same start position. + bool operator<(const ByteRange &RHS) const { + if (BeginOffset < RHS.BeginOffset) return true; + if (BeginOffset > RHS.BeginOffset) return false; + if (EndOffset > RHS.EndOffset) return true; + return false; + } + + /// \brief Support comparison with a single offset to allow binary searches. 
+ friend bool operator<(const ByteRange &LHS, uint64_t RHSOffset) { + return LHS.BeginOffset < RHSOffset; + } + + friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset, + const ByteRange &RHS) { + return LHSOffset < RHS.BeginOffset; + } + + bool operator==(const ByteRange &RHS) const { + return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset; + } + bool operator!=(const ByteRange &RHS) const { return !operator==(RHS); } + }; + + /// \brief A partition of an alloca. + /// + /// This structure represents a contiguous partition of the alloca. These are + /// formed by examining the uses of the alloca. During formation, they may + /// overlap but once an AllocaPartitioning is built, the Partitions within it + /// are all disjoint. + struct Partition : public ByteRange { + /// \brief Whether this partition is splittable into smaller partitions. + /// + /// We flag partitions as splittable when they are formed entirely due to + /// accesses by trivially splittable operations such as memset and memcpy. + bool IsSplittable; + + /// \brief Test whether a partition has been marked as dead. + bool isDead() const { + if (BeginOffset == UINT64_MAX) { + assert(EndOffset == UINT64_MAX); + return true; + } + return false; + } + + /// \brief Kill a partition. + /// This is accomplished by setting both its beginning and end offset to + /// the maximum possible value. + void kill() { + assert(!isDead() && "He's Dead, Jim!"); + BeginOffset = EndOffset = UINT64_MAX; + } + + Partition() : ByteRange(), IsSplittable() {} + Partition(uint64_t BeginOffset, uint64_t EndOffset, bool IsSplittable) + : ByteRange(BeginOffset, EndOffset), IsSplittable(IsSplittable) {} + }; + + /// \brief A particular use of a partition of the alloca. + /// + /// This structure is used to associate uses of a partition with it. They + /// mark the range of bytes which are referenced by a particular instruction, + /// and includes a handle to the user itself and the pointer value in use. + /// The bounds of these uses are determined by intersecting the bounds of the + /// memory use itself with a particular partition. As a consequence there is + /// intentionally overlap between various uses of the same partition. + struct PartitionUse : public ByteRange { + /// \brief The use in question. Provides access to both user and used value. + /// + /// Note that this may be null if the partition use is *dead*, that is, it + /// should be ignored. + Use *U; + + PartitionUse() : ByteRange(), U() {} + PartitionUse(uint64_t BeginOffset, uint64_t EndOffset, Use *U) + : ByteRange(BeginOffset, EndOffset), U(U) {} + }; + + /// \brief Construct a partitioning of a particular alloca. + /// + /// Construction does most of the work for partitioning the alloca. This + /// performs the necessary walks of users and builds a partitioning from it. + AllocaPartitioning(const DataLayout &TD, AllocaInst &AI); + + /// \brief Test whether a pointer to the allocation escapes our analysis. + /// + /// If this is true, the partitioning is never fully built and should be + /// ignored. + bool isEscaped() const { return PointerEscapingInstr; } + + /// \brief Support for iterating over the partitions. 
+ /// @{ + typedef SmallVectorImpl<Partition>::iterator iterator; + iterator begin() { return Partitions.begin(); } + iterator end() { return Partitions.end(); } + + typedef SmallVectorImpl<Partition>::const_iterator const_iterator; + const_iterator begin() const { return Partitions.begin(); } + const_iterator end() const { return Partitions.end(); } + /// @} + + /// \brief Support for iterating over and manipulating a particular + /// partition's uses. + /// + /// The iteration support provided for uses is more limited, but also + /// includes some manipulation routines to support rewriting the uses of + /// partitions during SROA. + /// @{ + typedef SmallVectorImpl<PartitionUse>::iterator use_iterator; + use_iterator use_begin(unsigned Idx) { return Uses[Idx].begin(); } + use_iterator use_begin(const_iterator I) { return Uses[I - begin()].begin(); } + use_iterator use_end(unsigned Idx) { return Uses[Idx].end(); } + use_iterator use_end(const_iterator I) { return Uses[I - begin()].end(); } + + typedef SmallVectorImpl<PartitionUse>::const_iterator const_use_iterator; + const_use_iterator use_begin(unsigned Idx) const { return Uses[Idx].begin(); } + const_use_iterator use_begin(const_iterator I) const { + return Uses[I - begin()].begin(); + } + const_use_iterator use_end(unsigned Idx) const { return Uses[Idx].end(); } + const_use_iterator use_end(const_iterator I) const { + return Uses[I - begin()].end(); + } + + unsigned use_size(unsigned Idx) const { return Uses[Idx].size(); } + unsigned use_size(const_iterator I) const { return Uses[I - begin()].size(); } + const PartitionUse &getUse(unsigned PIdx, unsigned UIdx) const { + return Uses[PIdx][UIdx]; + } + const PartitionUse &getUse(const_iterator I, unsigned UIdx) const { + return Uses[I - begin()][UIdx]; + } + + void use_push_back(unsigned Idx, const PartitionUse &PU) { + Uses[Idx].push_back(PU); + } + void use_push_back(const_iterator I, const PartitionUse &PU) { + Uses[I - begin()].push_back(PU); + } + /// @} + + /// \brief Allow iterating the dead users for this alloca. + /// + /// These are instructions which will never actually use the alloca as they + /// are outside the allocated range. They are safe to replace with undef and + /// delete. + /// @{ + typedef SmallVectorImpl<Instruction *>::const_iterator dead_user_iterator; + dead_user_iterator dead_user_begin() const { return DeadUsers.begin(); } + dead_user_iterator dead_user_end() const { return DeadUsers.end(); } + /// @} + + /// \brief Allow iterating the dead expressions referring to this alloca. + /// + /// These are operands which have cannot actually be used to refer to the + /// alloca as they are outside its range and the user doesn't correct for + /// that. These mostly consist of PHI node inputs and the like which we just + /// need to replace with undef. + /// @{ + typedef SmallVectorImpl<Use *>::const_iterator dead_op_iterator; + dead_op_iterator dead_op_begin() const { return DeadOperands.begin(); } + dead_op_iterator dead_op_end() const { return DeadOperands.end(); } + /// @} + + /// \brief MemTransferInst auxiliary data. + /// This struct provides some auxiliary data about memory transfer + /// intrinsics such as memcpy and memmove. These intrinsics can use two + /// different ranges within the same alloca, and provide other challenges to + /// correctly represent. We stash extra data to help us untangle this + /// after the partitioning is complete. 
+ struct MemTransferOffsets { + /// The destination begin and end offsets when the destination is within + /// this alloca. If the end offset is zero the destination is not within + /// this alloca. + uint64_t DestBegin, DestEnd; + + /// The source begin and end offsets when the source is within this alloca. + /// If the end offset is zero, the source is not within this alloca. + uint64_t SourceBegin, SourceEnd; + + /// Flag for whether an alloca is splittable. + bool IsSplittable; + }; + MemTransferOffsets getMemTransferOffsets(MemTransferInst &II) const { + return MemTransferInstData.lookup(&II); + } + + /// \brief Map from a PHI or select operand back to a partition. + /// + /// When manipulating PHI nodes or selects, they can use more than one + /// partition of an alloca. We store a special mapping to allow finding the + /// partition referenced by each of these operands, if any. + iterator findPartitionForPHIOrSelectOperand(Use *U) { + SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt + = PHIOrSelectOpMap.find(U); + if (MapIt == PHIOrSelectOpMap.end()) + return end(); + + return begin() + MapIt->second.first; + } + + /// \brief Map from a PHI or select operand back to the specific use of + /// a partition. + /// + /// Similar to mapping these operands back to the partitions, this maps + /// directly to the use structure of that partition. + use_iterator findPartitionUseForPHIOrSelectOperand(Use *U) { + SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt + = PHIOrSelectOpMap.find(U); + assert(MapIt != PHIOrSelectOpMap.end()); + return Uses[MapIt->second.first].begin() + MapIt->second.second; + } + + /// \brief Compute a common type among the uses of a particular partition. + /// + /// This routines walks all of the uses of a particular partition and tries + /// to find a common type between them. Untyped operations such as memset and + /// memcpy are ignored. + Type *getCommonType(iterator I) const; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const; + void printUsers(raw_ostream &OS, const_iterator I, + StringRef Indent = " ") const; + void print(raw_ostream &OS) const; + void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump(const_iterator I) const; + void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump() const; +#endif + +private: + template <typename DerivedT, typename RetT = void> class BuilderBase; + class PartitionBuilder; + friend class AllocaPartitioning::PartitionBuilder; + class UseBuilder; + friend class AllocaPartitioning::UseBuilder; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// \brief Handle to alloca instruction to simplify method interfaces. + AllocaInst &AI; +#endif + + /// \brief The instruction responsible for this alloca having no partitioning. + /// + /// When an instruction (potentially) escapes the pointer to the alloca, we + /// store a pointer to that here and abort trying to partition the alloca. + /// This will be null if the alloca is partitioned successfully. + Instruction *PointerEscapingInstr; + + /// \brief The partitions of the alloca. + /// + /// We store a vector of the partitions over the alloca here. This vector is + /// sorted by increasing begin offset, and then by decreasing end offset. See + /// the Partition inner class for more details. 
Initially (during + /// construction) there are overlaps, but we form a disjoint sequence of + /// partitions while finishing construction and a fully constructed object is + /// expected to always have this as a disjoint space. + SmallVector<Partition, 8> Partitions; + + /// \brief The uses of the partitions. + /// + /// This is essentially a mapping from each partition to a list of uses of + /// that partition. The mapping is done with a Uses vector that has the exact + /// same number of entries as the partition vector. Each entry is itself + /// a vector of the uses. + SmallVector<SmallVector<PartitionUse, 2>, 8> Uses; + + /// \brief Instructions which will become dead if we rewrite the alloca. + /// + /// Note that these are not separated by partition. This is because we expect + /// a partitioned alloca to be completely rewritten or not rewritten at all. + /// If rewritten, all these instructions can simply be removed and replaced + /// with undef as they come from outside of the allocated space. + SmallVector<Instruction *, 8> DeadUsers; + + /// \brief Operands which will become dead if we rewrite the alloca. + /// + /// These are operands that in their particular use can be replaced with + /// undef when we rewrite the alloca. These show up in out-of-bounds inputs + /// to PHI nodes and the like. They aren't entirely dead (there might be + /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we + /// want to swap this particular input for undef to simplify the use lists of + /// the alloca. + SmallVector<Use *, 8> DeadOperands; + + /// \brief The underlying storage for auxiliary memcpy and memset info. + SmallDenseMap<MemTransferInst *, MemTransferOffsets, 4> MemTransferInstData; + + /// \brief A side datastructure used when building up the partitions and uses. + /// + /// This mapping is only really used during the initial building of the + /// partitioning so that we can retain information about PHI and select nodes + /// processed. + SmallDenseMap<Instruction *, std::pair<uint64_t, bool> > PHIOrSelectSizes; + + /// \brief Auxiliary information for particular PHI or select operands. + SmallDenseMap<Use *, std::pair<unsigned, unsigned>, 4> PHIOrSelectOpMap; + + /// \brief A utility routine called from the constructor. + /// + /// This does what it says on the tin. It is the key of the alloca partition + /// splitting and merging. After it is called we have the desired disjoint + /// collection of partitions. + void splitAndMergePartitions(); +}; +} + +static Value *foldSelectInst(SelectInst &SI) { + // If the condition being selected on is a constant or the same value is + // being selected between, fold the select. Yes this does (rarely) happen + // early on. + if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition())) + return SI.getOperand(1+CI->isZero()); + if (SI.getOperand(1) == SI.getOperand(2)) { + return SI.getOperand(1); + } + return 0; +} + +/// \brief Builder for the alloca partitioning. +/// +/// This class builds an alloca partitioning by recursively visiting the uses +/// of an alloca and splitting the partitions for each load and store at each +/// offset. 
+class AllocaPartitioning::PartitionBuilder + : public PtrUseVisitor<PartitionBuilder> { + friend class PtrUseVisitor<PartitionBuilder>; + friend class InstVisitor<PartitionBuilder>; + typedef PtrUseVisitor<PartitionBuilder> Base; + + const uint64_t AllocSize; + AllocaPartitioning &P; + + SmallDenseMap<Instruction *, unsigned> MemTransferPartitionMap; + +public: + PartitionBuilder(const DataLayout &DL, AllocaInst &AI, AllocaPartitioning &P) + : PtrUseVisitor<PartitionBuilder>(DL), + AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), + P(P) {} + +private: + void insertUse(Instruction &I, const APInt &Offset, uint64_t Size, + bool IsSplittable = false) { + // Completely skip uses which have a zero size or start either before or + // past the end of the allocation. + if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize)) { + DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset + << " which has zero size or starts outside of the " + << AllocSize << " byte alloca:\n" + << " alloca: " << P.AI << "\n" + << " use: " << I << "\n"); + return; + } + + uint64_t BeginOffset = Offset.getZExtValue(); + uint64_t EndOffset = BeginOffset + Size; + + // Clamp the end offset to the end of the allocation. Note that this is + // formulated to handle even the case where "BeginOffset + Size" overflows. + // NOTE! This may appear superficially to be something we could ignore + // entirely, but that is not so! There may be PHI-node uses where some + // instructions are dead but not others. We can't completely ignore the + // PHI node, and so have to record at least the information here. + assert(AllocSize >= BeginOffset); // Established above. + if (Size > AllocSize - BeginOffset) { + DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset + << " to remain within the " << AllocSize << " byte alloca:\n" + << " alloca: " << P.AI << "\n" + << " use: " << I << "\n"); + EndOffset = AllocSize; + } + + Partition New(BeginOffset, EndOffset, IsSplittable); + P.Partitions.push_back(New); + } + + void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset, + bool IsVolatile) { + uint64_t Size = DL.getTypeStoreSize(Ty); + + // If this memory access can be shown to *statically* extend outside the + // bounds of of the allocation, it's behavior is undefined, so simply + // ignore it. Note that this is more strict than the generic clamping + // behavior of insertUse. We also try to handle cases which might run the + // risk of overflow. + // FIXME: We should instead consider the pointer to have escaped if this + // function is being instrumented for addressing bugs or race conditions. + if (Offset.isNegative() || Size > AllocSize || + Offset.ugt(AllocSize - Size)) { + DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte " + << (isa<LoadInst>(I) ? "load" : "store") << " @" << Offset + << " which extends past the end of the " << AllocSize + << " byte alloca:\n" + << " alloca: " << P.AI << "\n" + << " use: " << I << "\n"); + return; + } + + // We allow splitting of loads and stores where the type is an integer type + // and which cover the entire alloca. Such integer loads and stores + // often require decomposition into fine grained loads and stores. 
+ bool IsSplittable = false; + if (IntegerType *ITy = dyn_cast<IntegerType>(Ty)) + IsSplittable = !IsVolatile && ITy->getBitWidth() == AllocSize*8; + + insertUse(I, Offset, Size, IsSplittable); + } + + void visitLoadInst(LoadInst &LI) { + assert((!LI.isSimple() || LI.getType()->isSingleValueType()) && + "All simple FCA loads should have been pre-split"); + + if (!IsOffsetKnown) + return PI.setAborted(&LI); + + return handleLoadOrStore(LI.getType(), LI, Offset, LI.isVolatile()); + } + + void visitStoreInst(StoreInst &SI) { + Value *ValOp = SI.getValueOperand(); + if (ValOp == *U) + return PI.setEscapedAndAborted(&SI); + if (!IsOffsetKnown) + return PI.setAborted(&SI); + + assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) && + "All simple FCA stores should have been pre-split"); + handleLoadOrStore(ValOp->getType(), SI, Offset, SI.isVolatile()); + } + + + void visitMemSetInst(MemSetInst &II) { + assert(II.getRawDest() == *U && "Pointer use is not the destination?"); + ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); + if ((Length && Length->getValue() == 0) || + (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) + // Zero-length mem transfer intrinsics can be ignored entirely. + return; + + if (!IsOffsetKnown) + return PI.setAborted(&II); + + insertUse(II, Offset, + Length ? Length->getLimitedValue() + : AllocSize - Offset.getLimitedValue(), + (bool)Length); + } + + void visitMemTransferInst(MemTransferInst &II) { + ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); + if ((Length && Length->getValue() == 0) || + (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) + // Zero-length mem transfer intrinsics can be ignored entirely. + return; + + if (!IsOffsetKnown) + return PI.setAborted(&II); + + uint64_t RawOffset = Offset.getLimitedValue(); + uint64_t Size = Length ? Length->getLimitedValue() + : AllocSize - RawOffset; + + MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; + + // Only intrinsics with a constant length can be split. + Offsets.IsSplittable = Length; + + if (*U == II.getRawDest()) { + Offsets.DestBegin = RawOffset; + Offsets.DestEnd = RawOffset + Size; + } + if (*U == II.getRawSource()) { + Offsets.SourceBegin = RawOffset; + Offsets.SourceEnd = RawOffset + Size; + } + + // If we have set up end offsets for both the source and the destination, + // we have found both sides of this transfer pointing at the same alloca. + bool SeenBothEnds = Offsets.SourceEnd && Offsets.DestEnd; + if (SeenBothEnds && II.getRawDest() != II.getRawSource()) { + unsigned PrevIdx = MemTransferPartitionMap[&II]; + + // Check if the begin offsets match and this is a non-volatile transfer. + // In that case, we can completely elide the transfer. + if (!II.isVolatile() && Offsets.SourceBegin == Offsets.DestBegin) { + P.Partitions[PrevIdx].kill(); + return; + } + + // Otherwise we have an offset transfer within the same alloca. We can't + // split those. + P.Partitions[PrevIdx].IsSplittable = Offsets.IsSplittable = false; + } else if (SeenBothEnds) { + // Handle the case where this exact use provides both ends of the + // operation. + assert(II.getRawDest() == II.getRawSource()); + + // For non-volatile transfers this is a no-op. + if (!II.isVolatile()) + return; + + // Otherwise just suppress splitting. + Offsets.IsSplittable = false; + } + + + // Insert the use now that we've fixed up the splittable nature. 
+ insertUse(II, Offset, Size, Offsets.IsSplittable); + + // Setup the mapping from intrinsic to partition of we've not seen both + // ends of this transfer. + if (!SeenBothEnds) { + unsigned NewIdx = P.Partitions.size() - 1; + bool Inserted + = MemTransferPartitionMap.insert(std::make_pair(&II, NewIdx)).second; + assert(Inserted && + "Already have intrinsic in map but haven't seen both ends"); + (void)Inserted; + } + } + + // Disable SRoA for any intrinsics except for lifetime invariants. + // FIXME: What about debug instrinsics? This matches old behavior, but + // doesn't make sense. + void visitIntrinsicInst(IntrinsicInst &II) { + if (!IsOffsetKnown) + return PI.setAborted(&II); + + if (II.getIntrinsicID() == Intrinsic::lifetime_start || + II.getIntrinsicID() == Intrinsic::lifetime_end) { + ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0)); + uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(), + Length->getLimitedValue()); + insertUse(II, Offset, Size, true); + return; + } + + Base::visitIntrinsicInst(II); + } + + Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) { + // We consider any PHI or select that results in a direct load or store of + // the same offset to be a viable use for partitioning purposes. These uses + // are considered unsplittable and the size is the maximum loaded or stored + // size. + SmallPtrSet<Instruction *, 4> Visited; + SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses; + Visited.insert(Root); + Uses.push_back(std::make_pair(cast<Instruction>(*U), Root)); + // If there are no loads or stores, the access is dead. We mark that as + // a size zero access. + Size = 0; + do { + Instruction *I, *UsedI; + llvm::tie(UsedI, I) = Uses.pop_back_val(); + + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + Size = std::max(Size, DL.getTypeStoreSize(LI->getType())); + continue; + } + if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + Value *Op = SI->getOperand(0); + if (Op == UsedI) + return SI; + Size = std::max(Size, DL.getTypeStoreSize(Op->getType())); + continue; + } + + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) { + if (!GEP->hasAllZeroIndices()) + return GEP; + } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) && + !isa<SelectInst>(I)) { + return I; + } + + for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE; + ++UI) + if (Visited.insert(cast<Instruction>(*UI))) + Uses.push_back(std::make_pair(I, cast<Instruction>(*UI))); + } while (!Uses.empty()); + + return 0; + } + + void visitPHINode(PHINode &PN) { + if (PN.use_empty()) + return; + if (!IsOffsetKnown) + return PI.setAborted(&PN); + + // See if we already have computed info on this node. + std::pair<uint64_t, bool> &PHIInfo = P.PHIOrSelectSizes[&PN]; + if (PHIInfo.first) { + PHIInfo.second = true; + insertUse(PN, Offset, PHIInfo.first); + return; + } + + // Check for an unsafe use of the PHI node. + if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&PN, PHIInfo.first)) + return PI.setAborted(UnsafeI); + + insertUse(PN, Offset, PHIInfo.first); + } + + void visitSelectInst(SelectInst &SI) { + if (SI.use_empty()) + return; + if (Value *Result = foldSelectInst(SI)) { + if (Result == *U) + // If the result of the constant fold will be the pointer, recurse + // through the select as if we had RAUW'ed it. + enqueueUsers(SI); + + return; + } + if (!IsOffsetKnown) + return PI.setAborted(&SI); + + // See if we already have computed info on this node. 
+ std::pair<uint64_t, bool> &SelectInfo = P.PHIOrSelectSizes[&SI]; + if (SelectInfo.first) { + SelectInfo.second = true; + insertUse(SI, Offset, SelectInfo.first); + return; + } + + // Check for an unsafe use of the PHI node. + if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&SI, SelectInfo.first)) + return PI.setAborted(UnsafeI); + + insertUse(SI, Offset, SelectInfo.first); + } + + /// \brief Disable SROA entirely if there are unhandled users of the alloca. + void visitInstruction(Instruction &I) { + PI.setAborted(&I); + } +}; + +/// \brief Use adder for the alloca partitioning. +/// +/// This class adds the uses of an alloca to all of the partitions which they +/// use. For splittable partitions, this can end up doing essentially a linear +/// walk of the partitions, but the number of steps remains bounded by the +/// total result instruction size: +/// - The number of partitions is a result of the number unsplittable +/// instructions using the alloca. +/// - The number of users of each partition is at worst the total number of +/// splittable instructions using the alloca. +/// Thus we will produce N * M instructions in the end, where N are the number +/// of unsplittable uses and M are the number of splittable. This visitor does +/// the exact same number of updates to the partitioning. +/// +/// In the more common case, this visitor will leverage the fact that the +/// partition space is pre-sorted, and do a logarithmic search for the +/// partition needed, making the total visit a classical ((N + M) * log(N)) +/// complexity operation. +class AllocaPartitioning::UseBuilder : public PtrUseVisitor<UseBuilder> { + friend class PtrUseVisitor<UseBuilder>; + friend class InstVisitor<UseBuilder>; + typedef PtrUseVisitor<UseBuilder> Base; + + const uint64_t AllocSize; + AllocaPartitioning &P; + + /// \brief Set to de-duplicate dead instructions found in the use walk. + SmallPtrSet<Instruction *, 4> VisitedDeadInsts; + +public: + UseBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P) + : PtrUseVisitor<UseBuilder>(TD), + AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())), + P(P) {} + +private: + void markAsDead(Instruction &I) { + if (VisitedDeadInsts.insert(&I)) + P.DeadUsers.push_back(&I); + } + + void insertUse(Instruction &User, const APInt &Offset, uint64_t Size) { + // If the use has a zero size or extends outside of the allocation, record + // it as a dead use for elimination later. + if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize)) + return markAsDead(User); + + uint64_t BeginOffset = Offset.getZExtValue(); + uint64_t EndOffset = BeginOffset + Size; + + // Clamp the end offset to the end of the allocation. Note that this is + // formulated to handle even the case where "BeginOffset + Size" overflows. + assert(AllocSize >= BeginOffset); // Established above. + if (Size > AllocSize - BeginOffset) + EndOffset = AllocSize; + + // NB: This only works if we have zero overlapping partitions. 
+    iterator B = std::lower_bound(P.begin(), P.end(), BeginOffset);
+    if (B != P.begin() && llvm::prior(B)->EndOffset > BeginOffset)
+      B = llvm::prior(B);
+    for (iterator I = B, E = P.end(); I != E && I->BeginOffset < EndOffset;
+         ++I) {
+      PartitionUse NewPU(std::max(I->BeginOffset, BeginOffset),
+                         std::min(I->EndOffset, EndOffset), U);
+      P.use_push_back(I, NewPU);
+      if (isa<PHINode>(U->getUser()) || isa<SelectInst>(U->getUser()))
+        P.PHIOrSelectOpMap[U]
+          = std::make_pair(I - P.begin(), P.Uses[I - P.begin()].size() - 1);
+    }
+  }
+
+  void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset) {
+    uint64_t Size = DL.getTypeStoreSize(Ty);
+
+    // If this memory access can be shown to *statically* extend outside the
+    // bounds of the allocation, its behavior is undefined, so simply
+    // ignore it. Note that this is more strict than the generic clamping
+    // behavior of insertUse.
+    if (Offset.isNegative() || Size > AllocSize ||
+        Offset.ugt(AllocSize - Size))
+      return markAsDead(I);
+
+    insertUse(I, Offset, Size);
+  }
+
+  void visitBitCastInst(BitCastInst &BC) {
+    if (BC.use_empty())
+      return markAsDead(BC);
+
+    return Base::visitBitCastInst(BC);
+  }
+
+  void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+    if (GEPI.use_empty())
+      return markAsDead(GEPI);
+
+    return Base::visitGetElementPtrInst(GEPI);
+  }
+
+  void visitLoadInst(LoadInst &LI) {
+    assert(IsOffsetKnown);
+    handleLoadOrStore(LI.getType(), LI, Offset);
+  }
+
+  void visitStoreInst(StoreInst &SI) {
+    assert(IsOffsetKnown);
+    handleLoadOrStore(SI.getOperand(0)->getType(), SI, Offset);
+  }
+
+  void visitMemSetInst(MemSetInst &II) {
+    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+    if ((Length && Length->getValue() == 0) ||
+        (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
+      return markAsDead(II);
+
+    assert(IsOffsetKnown);
+    insertUse(II, Offset, Length ? Length->getLimitedValue()
+                                 : AllocSize - Offset.getLimitedValue());
+  }
+
+  void visitMemTransferInst(MemTransferInst &II) {
+    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+    if ((Length && Length->getValue() == 0) ||
+        (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
+      return markAsDead(II);
+
+    assert(IsOffsetKnown);
+    uint64_t Size = Length ? Length->getLimitedValue()
+                           : AllocSize - Offset.getLimitedValue();
+
+    MemTransferOffsets &Offsets = P.MemTransferInstData[&II];
+    if (!II.isVolatile() && Offsets.DestEnd && Offsets.SourceEnd &&
+        Offsets.DestBegin == Offsets.SourceBegin)
+      return markAsDead(II); // Skip identity transfers without side-effects.
+
+    insertUse(II, Offset, Size);
+  }
+
+  void visitIntrinsicInst(IntrinsicInst &II) {
+    assert(IsOffsetKnown);
+    assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
+           II.getIntrinsicID() == Intrinsic::lifetime_end);
+
+    ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
+    insertUse(II, Offset, std::min(Length->getLimitedValue(),
+                                   AllocSize - Offset.getLimitedValue()));
+  }
+
+  void insertPHIOrSelect(Instruction &User, const APInt &Offset) {
+    uint64_t Size = P.PHIOrSelectSizes.lookup(&User).first;
+
+    // For PHI and select operands outside the alloca, we can't nuke the entire
+    // phi or select -- the other side might still be relevant, so we special
+    // case them here and use a separate structure to track the operands
+    // themselves which should be replaced with undef.
+ if ((Offset.isNegative() && Offset.uge(Size)) || + (!Offset.isNegative() && Offset.uge(AllocSize))) { + P.DeadOperands.push_back(U); + return; + } + + insertUse(User, Offset, Size); + } + + void visitPHINode(PHINode &PN) { + if (PN.use_empty()) + return markAsDead(PN); + + assert(IsOffsetKnown); + insertPHIOrSelect(PN, Offset); + } + + void visitSelectInst(SelectInst &SI) { + if (SI.use_empty()) + return markAsDead(SI); + + if (Value *Result = foldSelectInst(SI)) { + if (Result == *U) + // If the result of the constant fold will be the pointer, recurse + // through the select as if we had RAUW'ed it. + enqueueUsers(SI); + else + // Otherwise the operand to the select is dead, and we can replace it + // with undef. + P.DeadOperands.push_back(U); + + return; + } + + assert(IsOffsetKnown); + insertPHIOrSelect(SI, Offset); + } + + /// \brief Unreachable, we've already visited the alloca once. + void visitInstruction(Instruction &I) { + llvm_unreachable("Unhandled instruction in use builder."); + } +}; + +void AllocaPartitioning::splitAndMergePartitions() { + size_t NumDeadPartitions = 0; + + // Track the range of splittable partitions that we pass when accumulating + // overlapping unsplittable partitions. + uint64_t SplitEndOffset = 0ull; + + Partition New(0ull, 0ull, false); + + for (unsigned i = 0, j = i, e = Partitions.size(); i != e; i = j) { + ++j; + + if (!Partitions[i].IsSplittable || New.BeginOffset == New.EndOffset) { + assert(New.BeginOffset == New.EndOffset); + New = Partitions[i]; + } else { + assert(New.IsSplittable); + New.EndOffset = std::max(New.EndOffset, Partitions[i].EndOffset); + } + assert(New.BeginOffset != New.EndOffset); + + // Scan the overlapping partitions. + while (j != e && New.EndOffset > Partitions[j].BeginOffset) { + // If the new partition we are forming is splittable, stop at the first + // unsplittable partition. + if (New.IsSplittable && !Partitions[j].IsSplittable) + break; + + // Grow the new partition to include any equally splittable range. 'j' is + // always equally splittable when New is splittable, but when New is not + // splittable, we may subsume some (or part of some) splitable partition + // without growing the new one. + if (New.IsSplittable == Partitions[j].IsSplittable) { + New.EndOffset = std::max(New.EndOffset, Partitions[j].EndOffset); + } else { + assert(!New.IsSplittable); + assert(Partitions[j].IsSplittable); + SplitEndOffset = std::max(SplitEndOffset, Partitions[j].EndOffset); + } + + Partitions[j].kill(); + ++NumDeadPartitions; + ++j; + } + + // If the new partition is splittable, chop off the end as soon as the + // unsplittable subsequent partition starts and ensure we eventually cover + // the splittable area. + if (j != e && New.IsSplittable) { + SplitEndOffset = std::max(SplitEndOffset, New.EndOffset); + New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset); + } + + // Add the new partition if it differs from the original one and is + // non-empty. We can end up with an empty partition here if it was + // splittable but there is an unsplittable one that starts at the same + // offset. + if (New != Partitions[i]) { + if (New.BeginOffset != New.EndOffset) + Partitions.push_back(New); + // Mark the old one for removal. 
+ Partitions[i].kill(); + ++NumDeadPartitions; + } + + New.BeginOffset = New.EndOffset; + if (!New.IsSplittable) { + New.EndOffset = std::max(New.EndOffset, SplitEndOffset); + if (j != e && !Partitions[j].IsSplittable) + New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset); + New.IsSplittable = true; + // If there is a trailing splittable partition which won't be fused into + // the next splittable partition go ahead and add it onto the partitions + // list. + if (New.BeginOffset < New.EndOffset && + (j == e || !Partitions[j].IsSplittable || + New.EndOffset < Partitions[j].BeginOffset)) { + Partitions.push_back(New); + New.BeginOffset = New.EndOffset = 0ull; + } + } + } + + // Re-sort the partitions now that they have been split and merged into + // disjoint set of partitions. Also remove any of the dead partitions we've + // replaced in the process. + std::sort(Partitions.begin(), Partitions.end()); + if (NumDeadPartitions) { + assert(Partitions.back().isDead()); + assert((ptrdiff_t)NumDeadPartitions == + std::count(Partitions.begin(), Partitions.end(), Partitions.back())); + } + Partitions.erase(Partitions.end() - NumDeadPartitions, Partitions.end()); +} + +AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI) + : +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + AI(AI), +#endif + PointerEscapingInstr(0) { + PartitionBuilder PB(TD, AI, *this); + PartitionBuilder::PtrInfo PtrI = PB.visitPtr(AI); + if (PtrI.isEscaped() || PtrI.isAborted()) { + // FIXME: We should sink the escape vs. abort info into the caller nicely, + // possibly by just storing the PtrInfo in the AllocaPartitioning. + PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst() + : PtrI.getAbortingInst(); + assert(PointerEscapingInstr && "Did not track a bad instruction"); + return; + } + + // Sort the uses. This arranges for the offsets to be in ascending order, + // and the sizes to be in descending order. + std::sort(Partitions.begin(), Partitions.end()); + + // Remove any partitions from the back which are marked as dead. + while (!Partitions.empty() && Partitions.back().isDead()) + Partitions.pop_back(); + + if (Partitions.size() > 1) { + // Intersect splittability for all partitions with equal offsets and sizes. + // Then remove all but the first so that we have a sequence of non-equal but + // potentially overlapping partitions. + for (iterator I = Partitions.begin(), J = I, E = Partitions.end(); I != E; + I = J) { + ++J; + while (J != E && *I == *J) { + I->IsSplittable &= J->IsSplittable; + ++J; + } + } + Partitions.erase(std::unique(Partitions.begin(), Partitions.end()), + Partitions.end()); + + // Split splittable and merge unsplittable partitions into a disjoint set + // of partitions over the used space of the allocation. + splitAndMergePartitions(); + } + + // Now build up the user lists for each of these disjoint partitions by + // re-walking the recursive users of the alloca. + Uses.resize(Partitions.size()); + UseBuilder UB(TD, AI, *this); + PtrI = UB.visitPtr(AI); + assert(!PtrI.isEscaped() && "Previously analyzed pointer now escapes!"); + assert(!PtrI.isAborted() && "Early aborted the visit of the pointer."); +} + +Type *AllocaPartitioning::getCommonType(iterator I) const { + Type *Ty = 0; + for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) { + if (!UI->U) + continue; // Skip dead uses. 
+ if (isa<IntrinsicInst>(*UI->U->getUser())) + continue; + if (UI->BeginOffset != I->BeginOffset || UI->EndOffset != I->EndOffset) + continue; + + Type *UserTy = 0; + if (LoadInst *LI = dyn_cast<LoadInst>(UI->U->getUser())) { + UserTy = LI->getType(); + } else if (StoreInst *SI = dyn_cast<StoreInst>(UI->U->getUser())) { + UserTy = SI->getValueOperand()->getType(); + } else { + return 0; // Bail if we have weird uses. + } + + if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) { + // If the type is larger than the partition, skip it. We only encounter + // this for split integer operations where we want to use the type of the + // entity causing the split. + if (ITy->getBitWidth() > (I->EndOffset - I->BeginOffset)*8) + continue; + + // If we have found an integer type use covering the alloca, use that + // regardless of the other types, as integers are often used for a "bucket + // of bits" type. + return ITy; + } + + if (Ty && Ty != UserTy) + return 0; + + Ty = UserTy; + } + return Ty; +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + +void AllocaPartitioning::print(raw_ostream &OS, const_iterator I, + StringRef Indent) const { + OS << Indent << "partition #" << (I - begin()) + << " [" << I->BeginOffset << "," << I->EndOffset << ")" + << (I->IsSplittable ? " (splittable)" : "") + << (Uses[I - begin()].empty() ? " (zero uses)" : "") + << "\n"; +} + +void AllocaPartitioning::printUsers(raw_ostream &OS, const_iterator I, + StringRef Indent) const { + for (const_use_iterator UI = use_begin(I), UE = use_end(I); + UI != UE; ++UI) { + if (!UI->U) + continue; // Skip dead uses. + OS << Indent << " [" << UI->BeginOffset << "," << UI->EndOffset << ") " + << "used by: " << *UI->U->getUser() << "\n"; + if (MemTransferInst *II = dyn_cast<MemTransferInst>(UI->U->getUser())) { + const MemTransferOffsets &MTO = MemTransferInstData.lookup(II); + bool IsDest; + if (!MTO.IsSplittable) + IsDest = UI->BeginOffset == MTO.DestBegin; + else + IsDest = MTO.DestBegin != 0u; + OS << Indent << " (original " << (IsDest ? "dest" : "source") << ": " + << "[" << (IsDest ? MTO.DestBegin : MTO.SourceBegin) + << "," << (IsDest ? MTO.DestEnd : MTO.SourceEnd) << ")\n"; + } + } +} + +void AllocaPartitioning::print(raw_ostream &OS) const { + if (PointerEscapingInstr) { + OS << "No partitioning for alloca: " << AI << "\n" + << " A pointer to this alloca escaped by:\n" + << " " << *PointerEscapingInstr << "\n"; + return; + } + + OS << "Partitioning of alloca: " << AI << "\n"; + unsigned Num = 0; + for (const_iterator I = begin(), E = end(); I != E; ++I, ++Num) { + print(OS, I); + printUsers(OS, I); + } +} + +void AllocaPartitioning::dump(const_iterator I) const { print(dbgs(), I); } +void AllocaPartitioning::dump() const { print(dbgs()); } + +#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + + +namespace { +/// \brief Implementation of LoadAndStorePromoter for promoting allocas. +/// +/// This subclass of LoadAndStorePromoter adds overrides to handle promoting +/// the loads and stores of an alloca instruction, as well as updating its +/// debug information. This is used when a domtree is unavailable and thus +/// mem2reg in its full form can't be used to handle promotion of allocas to +/// scalar values. 
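+///
+/// A rough usage sketch (assuming the caller has already verified that the
+/// alloca is promotable and collected its direct load and store users into
+/// Insts, with M the enclosing Module):
+///
+///   SSAUpdater SSA;
+///   DIBuilder DIB(M);
+///   AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts);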
+class AllocaPromoter : public LoadAndStorePromoter { + AllocaInst &AI; + DIBuilder &DIB; + + SmallVector<DbgDeclareInst *, 4> DDIs; + SmallVector<DbgValueInst *, 4> DVIs; + +public: + AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, + AllocaInst &AI, DIBuilder &DIB) + : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} + + void run(const SmallVectorImpl<Instruction*> &Insts) { + // Remember which alloca we're promoting (for isInstInList). + if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) { + for (Value::use_iterator UI = DebugNode->use_begin(), + UE = DebugNode->use_end(); + UI != UE; ++UI) + if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI)) + DDIs.push_back(DDI); + else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(*UI)) + DVIs.push_back(DVI); + } + + LoadAndStorePromoter::run(Insts); + AI.eraseFromParent(); + while (!DDIs.empty()) + DDIs.pop_back_val()->eraseFromParent(); + while (!DVIs.empty()) + DVIs.pop_back_val()->eraseFromParent(); + } + + virtual bool isInstInList(Instruction *I, + const SmallVectorImpl<Instruction*> &Insts) const { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + return LI->getOperand(0) == &AI; + return cast<StoreInst>(I)->getPointerOperand() == &AI; + } + + virtual void updateDebugInfo(Instruction *Inst) const { + for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(), + E = DDIs.end(); I != E; ++I) { + DbgDeclareInst *DDI = *I; + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) + ConvertDebugDeclareToDebugValue(DDI, SI, DIB); + else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) + ConvertDebugDeclareToDebugValue(DDI, LI, DIB); + } + for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), + E = DVIs.end(); I != E; ++I) { + DbgValueInst *DVI = *I; + Value *Arg = NULL; + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + // If an argument is zero extended then use argument directly. The ZExt + // may be zapped by an optimization pass in future. + if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0))) + Arg = dyn_cast<Argument>(ZExt->getOperand(0)); + if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) + Arg = dyn_cast<Argument>(SExt->getOperand(0)); + if (!Arg) + Arg = SI->getOperand(0); + } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + Arg = LI->getOperand(0); + } else { + continue; + } + Instruction *DbgVal = + DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()), + Inst); + DbgVal->setDebugLoc(DVI->getDebugLoc()); + } + } +}; +} // end anon namespace + + +namespace { +/// \brief An optimization pass providing Scalar Replacement of Aggregates. +/// +/// This pass takes allocations which can be completely analyzed (that is, they +/// don't escape) and tries to turn them into scalar SSA values. There are +/// a few steps to this process. +/// +/// 1) It takes allocations of aggregates and analyzes the ways in which they +/// are used to try to split them into smaller allocations, ideally of +/// a single scalar data type. It will split up memcpy and memset accesses +/// as necessary and try to isolate invidual scalar accesses. +/// 2) It will transform accesses into forms which are suitable for SSA value +/// promotion. This can be replacing a memset with a scalar store of an +/// integer value, or it can involve speculating operations on a PHI or +/// select to be a PHI or select of the results. 
+/// 3) Finally, this will try to detect a pattern of accesses which map cleanly +/// onto insert and extract operations on a vector value, and convert them to +/// this form. By doing so, it will enable promotion of vector aggregates to +/// SSA vector values. +class SROA : public FunctionPass { + const bool RequiresDomTree; + + LLVMContext *C; + const DataLayout *TD; + DominatorTree *DT; + + /// \brief Worklist of alloca instructions to simplify. + /// + /// Each alloca in the function is added to this. Each new alloca formed gets + /// added to it as well to recursively simplify unless that alloca can be + /// directly promoted. Finally, each time we rewrite a use of an alloca other + /// the one being actively rewritten, we add it back onto the list if not + /// already present to ensure it is re-visited. + SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > Worklist; + + /// \brief A collection of instructions to delete. + /// We try to batch deletions to simplify code and make things a bit more + /// efficient. + SetVector<Instruction *, SmallVector<Instruction *, 8> > DeadInsts; + + /// \brief Post-promotion worklist. + /// + /// Sometimes we discover an alloca which has a high probability of becoming + /// viable for SROA after a round of promotion takes place. In those cases, + /// the alloca is enqueued here for re-processing. + /// + /// Note that we have to be very careful to clear allocas out of this list in + /// the event they are deleted. + SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > PostPromotionWorklist; + + /// \brief A collection of alloca instructions we can directly promote. + std::vector<AllocaInst *> PromotableAllocas; + +public: + SROA(bool RequiresDomTree = true) + : FunctionPass(ID), RequiresDomTree(RequiresDomTree), + C(0), TD(0), DT(0) { + initializeSROAPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F); + void getAnalysisUsage(AnalysisUsage &AU) const; + + const char *getPassName() const { return "SROA"; } + static char ID; + +private: + friend class PHIOrSelectSpeculator; + friend class AllocaPartitionRewriter; + friend class AllocaPartitionVectorRewriter; + + bool rewriteAllocaPartition(AllocaInst &AI, + AllocaPartitioning &P, + AllocaPartitioning::iterator PI); + bool splitAlloca(AllocaInst &AI, AllocaPartitioning &P); + bool runOnAlloca(AllocaInst &AI); + void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas); + bool promoteAllocas(Function &F); +}; +} + +char SROA::ID = 0; + +FunctionPass *llvm::createSROAPass(bool RequiresDomTree) { + return new SROA(RequiresDomTree); +} + +INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", + false, false) + +namespace { +/// \brief Visitor to speculate PHIs and Selects where possible. +class PHIOrSelectSpeculator : public InstVisitor<PHIOrSelectSpeculator> { + // Befriend the base class so it can delegate to private visit methods. + friend class llvm::InstVisitor<PHIOrSelectSpeculator>; + + const DataLayout &TD; + AllocaPartitioning &P; + SROA &Pass; + +public: + PHIOrSelectSpeculator(const DataLayout &TD, AllocaPartitioning &P, SROA &Pass) + : TD(TD), P(P), Pass(Pass) {} + + /// \brief Visit the users of an alloca partition and rewrite them. 
+ void visitUsers(AllocaPartitioning::const_iterator PI) { + // Note that we need to use an index here as the underlying vector of uses + // may be grown during speculation. However, we never need to re-visit the + // new uses, and so we can use the initial size bound. + for (unsigned Idx = 0, Size = P.use_size(PI); Idx != Size; ++Idx) { + const AllocaPartitioning::PartitionUse &PU = P.getUse(PI, Idx); + if (!PU.U) + continue; // Skip dead use. + + visit(cast<Instruction>(PU.U->getUser())); + } + } + +private: + // By default, skip this instruction. + void visitInstruction(Instruction &I) {} + + /// PHI instructions that use an alloca and are subsequently loaded can be + /// rewritten to load both input pointers in the pred blocks and then PHI the + /// results, allowing the load of the alloca to be promoted. + /// From this: + /// %P2 = phi [i32* %Alloca, i32* %Other] + /// %V = load i32* %P2 + /// to: + /// %V1 = load i32* %Alloca -> will be mem2reg'd + /// ... + /// %V2 = load i32* %Other + /// ... + /// %V = phi [i32 %V1, i32 %V2] + /// + /// We can do this to a select if its only uses are loads and if the operands + /// to the select can be loaded unconditionally. + /// + /// FIXME: This should be hoisted into a generic utility, likely in + /// Transforms/Util/Local.h + bool isSafePHIToSpeculate(PHINode &PN, SmallVectorImpl<LoadInst *> &Loads) { + // For now, we can only do this promotion if the load is in the same block + // as the PHI, and if there are no stores between the phi and load. + // TODO: Allow recursive phi users. + // TODO: Allow stores. + BasicBlock *BB = PN.getParent(); + unsigned MaxAlign = 0; + for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end(); + UI != UE; ++UI) { + LoadInst *LI = dyn_cast<LoadInst>(*UI); + if (LI == 0 || !LI->isSimple()) return false; + + // For now we only allow loads in the same block as the PHI. This is + // a common case that happens when instcombine merges two loads through + // a PHI. + if (LI->getParent() != BB) return false; + + // Ensure that there are no instructions between the PHI and the load that + // could store. + for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI) + if (BBI->mayWriteToMemory()) + return false; + + MaxAlign = std::max(MaxAlign, LI->getAlignment()); + Loads.push_back(LI); + } + + // We can only transform this if it is safe to push the loads into the + // predecessor blocks. The only thing to watch out for is that we can't put + // a possibly trapping load in the predecessor if it is a critical edge. + for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; + ++Idx) { + TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator(); + Value *InVal = PN.getIncomingValue(Idx); + + // If the value is produced by the terminator of the predecessor (an + // invoke) or it has side-effects, there is no valid place to put a load + // in the predecessor. + if (TI == InVal || TI->mayHaveSideEffects()) + return false; + + // If the predecessor has a single successor, then the edge isn't + // critical. + if (TI->getNumSuccessors() == 1) + continue; + + // If this pointer is always safe to load, or if we can prove that there + // is already a load in the block, then we can move the load to the pred + // block. 
+ if (InVal->isDereferenceablePointer() || + isSafeToLoadUnconditionally(InVal, TI, MaxAlign, &TD)) + continue; + + return false; + } + + return true; + } + + void visitPHINode(PHINode &PN) { + DEBUG(dbgs() << " original: " << PN << "\n"); + + SmallVector<LoadInst *, 4> Loads; + if (!isSafePHIToSpeculate(PN, Loads)) + return; + + assert(!Loads.empty()); + + Type *LoadTy = cast<PointerType>(PN.getType())->getElementType(); + IRBuilder<> PHIBuilder(&PN); + PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), + PN.getName() + ".sroa.speculated"); + + // Get the TBAA tag and alignment to use from one of the loads. It doesn't + // matter which one we get and if any differ, it doesn't matter. + LoadInst *SomeLoad = cast<LoadInst>(Loads.back()); + MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); + unsigned Align = SomeLoad->getAlignment(); + + // Rewrite all loads of the PN to use the new PHI. + do { + LoadInst *LI = Loads.pop_back_val(); + LI->replaceAllUsesWith(NewPN); + Pass.DeadInsts.insert(LI); + } while (!Loads.empty()); + + // Inject loads into all of the pred blocks. + for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { + BasicBlock *Pred = PN.getIncomingBlock(Idx); + TerminatorInst *TI = Pred->getTerminator(); + Use *InUse = &PN.getOperandUse(PN.getOperandNumForIncomingValue(Idx)); + Value *InVal = PN.getIncomingValue(Idx); + IRBuilder<> PredBuilder(TI); + + LoadInst *Load + = PredBuilder.CreateLoad(InVal, (PN.getName() + ".sroa.speculate.load." + + Pred->getName())); + ++NumLoadsSpeculated; + Load->setAlignment(Align); + if (TBAATag) + Load->setMetadata(LLVMContext::MD_tbaa, TBAATag); + NewPN->addIncoming(Load, Pred); + + Instruction *Ptr = dyn_cast<Instruction>(InVal); + if (!Ptr) + // No uses to rewrite. + continue; + + // Try to lookup and rewrite any partition uses corresponding to this phi + // input. + AllocaPartitioning::iterator PI + = P.findPartitionForPHIOrSelectOperand(InUse); + if (PI == P.end()) + continue; + + // Replace the Use in the PartitionUse for this operand with the Use + // inside the load. + AllocaPartitioning::use_iterator UI + = P.findPartitionUseForPHIOrSelectOperand(InUse); + assert(isa<PHINode>(*UI->U->getUser())); + UI->U = &Load->getOperandUse(Load->getPointerOperandIndex()); + } + DEBUG(dbgs() << " speculated to: " << *NewPN << "\n"); + } + + /// Select instructions that use an alloca and are subsequently loaded can be + /// rewritten to load both input pointers and then select between the result, + /// allowing the load of the alloca to be promoted. + /// From this: + /// %P2 = select i1 %cond, i32* %Alloca, i32* %Other + /// %V = load i32* %P2 + /// to: + /// %V1 = load i32* %Alloca -> will be mem2reg'd + /// %V2 = load i32* %Other + /// %V = select i1 %cond, i32 %V1, i32 %V2 + /// + /// We can do this to a select if its only uses are loads and if the operand + /// to the select can be loaded unconditionally. + bool isSafeSelectToSpeculate(SelectInst &SI, + SmallVectorImpl<LoadInst *> &Loads) { + Value *TValue = SI.getTrueValue(); + Value *FValue = SI.getFalseValue(); + bool TDerefable = TValue->isDereferenceablePointer(); + bool FDerefable = FValue->isDereferenceablePointer(); + + for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end(); + UI != UE; ++UI) { + LoadInst *LI = dyn_cast<LoadInst>(*UI); + if (LI == 0 || !LI->isSimple()) return false; + + // Both operands to the select need to be dereferencable, either + // absolutely (e.g. 
allocas) or at this point because we can see other + // accesses to it. + if (!TDerefable && !isSafeToLoadUnconditionally(TValue, LI, + LI->getAlignment(), &TD)) + return false; + if (!FDerefable && !isSafeToLoadUnconditionally(FValue, LI, + LI->getAlignment(), &TD)) + return false; + Loads.push_back(LI); + } + + return true; + } + + void visitSelectInst(SelectInst &SI) { + DEBUG(dbgs() << " original: " << SI << "\n"); + IRBuilder<> IRB(&SI); + + // If the select isn't safe to speculate, just use simple logic to emit it. + SmallVector<LoadInst *, 4> Loads; + if (!isSafeSelectToSpeculate(SI, Loads)) + return; + + Use *Ops[2] = { &SI.getOperandUse(1), &SI.getOperandUse(2) }; + AllocaPartitioning::iterator PIs[2]; + AllocaPartitioning::PartitionUse PUs[2]; + for (unsigned i = 0, e = 2; i != e; ++i) { + PIs[i] = P.findPartitionForPHIOrSelectOperand(Ops[i]); + if (PIs[i] != P.end()) { + // If the pointer is within the partitioning, remove the select from + // its uses. We'll add in the new loads below. + AllocaPartitioning::use_iterator UI + = P.findPartitionUseForPHIOrSelectOperand(Ops[i]); + PUs[i] = *UI; + // Clear out the use here so that the offsets into the use list remain + // stable but this use is ignored when rewriting. + UI->U = 0; + } + } + + Value *TV = SI.getTrueValue(); + Value *FV = SI.getFalseValue(); + // Replace the loads of the select with a select of two loads. + while (!Loads.empty()) { + LoadInst *LI = Loads.pop_back_val(); + + IRB.SetInsertPoint(LI); + LoadInst *TL = + IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true"); + LoadInst *FL = + IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false"); + NumLoadsSpeculated += 2; + + // Transfer alignment and TBAA info if present. + TL->setAlignment(LI->getAlignment()); + FL->setAlignment(LI->getAlignment()); + if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) { + TL->setMetadata(LLVMContext::MD_tbaa, Tag); + FL->setMetadata(LLVMContext::MD_tbaa, Tag); + } + + Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL, + LI->getName() + ".sroa.speculated"); + + LoadInst *Loads[2] = { TL, FL }; + for (unsigned i = 0, e = 2; i != e; ++i) { + if (PIs[i] != P.end()) { + Use *LoadUse = &Loads[i]->getOperandUse(0); + assert(PUs[i].U->get() == LoadUse->get()); + PUs[i].U = LoadUse; + P.use_push_back(PIs[i], PUs[i]); + } + } + + DEBUG(dbgs() << " speculated to: " << *V << "\n"); + LI->replaceAllUsesWith(V); + Pass.DeadInsts.insert(LI); + } + } +}; +} + +/// \brief Build a GEP out of a base pointer and indices. +/// +/// This will return the BasePtr if that is valid, or build a new GEP +/// instruction using the IRBuilder if GEP-ing is needed. +static Value *buildGEP(IRBuilder<> &IRB, Value *BasePtr, + SmallVectorImpl<Value *> &Indices, + const Twine &Prefix) { + if (Indices.empty()) + return BasePtr; + + // A single zero index is a no-op, so check for this and avoid building a GEP + // in that case. + if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero()) + return BasePtr; + + return IRB.CreateInBoundsGEP(BasePtr, Indices, Prefix + ".idx"); +} + +/// \brief Get a natural GEP off of the BasePtr walking through Ty toward +/// TargetTy without changing the offset of the pointer. +/// +/// This routine assumes we've already established a properly offset GEP with +/// Indices, and arrived at the Ty type. The goal is to continue to GEP with +/// zero-indices down through type layers until we find one the same as +/// TargetTy. 
If we can't find one with the same type, we at least try to use +/// one with the same size. If none of that works, we just produce the GEP as +/// indicated by Indices to have the correct offset. +static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const DataLayout &TD, + Value *BasePtr, Type *Ty, Type *TargetTy, + SmallVectorImpl<Value *> &Indices, + const Twine &Prefix) { + if (Ty == TargetTy) + return buildGEP(IRB, BasePtr, Indices, Prefix); + + // See if we can descend into a struct and locate a field with the correct + // type. + unsigned NumLayers = 0; + Type *ElementTy = Ty; + do { + if (ElementTy->isPointerTy()) + break; + if (SequentialType *SeqTy = dyn_cast<SequentialType>(ElementTy)) { + ElementTy = SeqTy->getElementType(); + // Note that we use the default address space as this index is over an + // array or a vector, not a pointer. + Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(0), 0))); + } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) { + if (STy->element_begin() == STy->element_end()) + break; // Nothing left to descend into. + ElementTy = *STy->element_begin(); + Indices.push_back(IRB.getInt32(0)); + } else { + break; + } + ++NumLayers; + } while (ElementTy != TargetTy); + if (ElementTy != TargetTy) + Indices.erase(Indices.end() - NumLayers, Indices.end()); + + return buildGEP(IRB, BasePtr, Indices, Prefix); +} + +/// \brief Recursively compute indices for a natural GEP. +/// +/// This is the recursive step for getNaturalGEPWithOffset that walks down the +/// element types adding appropriate indices for the GEP. +static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD, + Value *Ptr, Type *Ty, APInt &Offset, + Type *TargetTy, + SmallVectorImpl<Value *> &Indices, + const Twine &Prefix) { + if (Offset == 0) + return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices, Prefix); + + // We can't recurse through pointer types. + if (Ty->isPointerTy()) + return 0; + + // We try to analyze GEPs over vectors here, but note that these GEPs are + // extremely poorly defined currently. The long-term goal is to remove GEPing + // over a vector from the IR completely. + if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) { + unsigned ElementSizeInBits = TD.getTypeSizeInBits(VecTy->getScalarType()); + if (ElementSizeInBits % 8) + return 0; // GEPs over non-multiple of 8 size vector elements are invalid. 
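+    // Divide the remaining offset by the element size to see how many whole
+    // elements it skips, then recurse into the element type with whatever
+    // offset is left over.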
+ APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8); + APInt NumSkippedElements = Offset.sdiv(ElementSize); + if (NumSkippedElements.ugt(VecTy->getNumElements())) + return 0; + Offset -= NumSkippedElements * ElementSize; + Indices.push_back(IRB.getInt(NumSkippedElements)); + return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(), + Offset, TargetTy, Indices, Prefix); + } + + if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { + Type *ElementTy = ArrTy->getElementType(); + APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); + APInt NumSkippedElements = Offset.sdiv(ElementSize); + if (NumSkippedElements.ugt(ArrTy->getNumElements())) + return 0; + + Offset -= NumSkippedElements * ElementSize; + Indices.push_back(IRB.getInt(NumSkippedElements)); + return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, + Indices, Prefix); + } + + StructType *STy = dyn_cast<StructType>(Ty); + if (!STy) + return 0; + + const StructLayout *SL = TD.getStructLayout(STy); + uint64_t StructOffset = Offset.getZExtValue(); + if (StructOffset >= SL->getSizeInBytes()) + return 0; + unsigned Index = SL->getElementContainingOffset(StructOffset); + Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index)); + Type *ElementTy = STy->getElementType(Index); + if (Offset.uge(TD.getTypeAllocSize(ElementTy))) + return 0; // The offset points into alignment padding. + + Indices.push_back(IRB.getInt32(Index)); + return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, + Indices, Prefix); +} + +/// \brief Get a natural GEP from a base pointer to a particular offset and +/// resulting in a particular type. +/// +/// The goal is to produce a "natural" looking GEP that works with the existing +/// composite types to arrive at the appropriate offset and element type for +/// a pointer. TargetTy is the element type the returned GEP should point-to if +/// possible. We recurse by decreasing Offset, adding the appropriate index to +/// Indices, and setting Ty to the result subtype. +/// +/// If no natural GEP can be constructed, this function returns null. +static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const DataLayout &TD, + Value *Ptr, APInt Offset, Type *TargetTy, + SmallVectorImpl<Value *> &Indices, + const Twine &Prefix) { + PointerType *Ty = cast<PointerType>(Ptr->getType()); + + // Don't consider any GEPs through an i8* as natural unless the TargetTy is + // an i8. + if (Ty == IRB.getInt8PtrTy() && TargetTy->isIntegerTy(8)) + return 0; + + Type *ElementTy = Ty->getElementType(); + if (!ElementTy->isSized()) + return 0; // We can't GEP through an unsized element. + APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); + if (ElementSize == 0) + return 0; // Zero-length arrays can't help us build a natural GEP. + APInt NumSkippedElements = Offset.sdiv(ElementSize); + + Offset -= NumSkippedElements * ElementSize; + Indices.push_back(IRB.getInt(NumSkippedElements)); + return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, + Indices, Prefix); +} + +/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the +/// resulting pointer has PointerTy. +/// +/// This tries very hard to compute a "natural" GEP which arrives at the offset +/// and produces the pointer type desired. Where it cannot, it will try to use +/// the natural GEP to arrive at the offset and bitcast to the type. 
Where that +/// fails, it will try to use an existing i8* and GEP to the byte offset and +/// bitcast to the type. +/// +/// The strategy for finding the more natural GEPs is to peel off layers of the +/// pointer, walking back through bit casts and GEPs, searching for a base +/// pointer from which we can compute a natural GEP with the desired +/// properities. The algorithm tries to fold as many constant indices into +/// a single GEP as possible, thus making each GEP more independent of the +/// surrounding code. +static Value *getAdjustedPtr(IRBuilder<> &IRB, const DataLayout &TD, + Value *Ptr, APInt Offset, Type *PointerTy, + const Twine &Prefix) { + // Even though we don't look through PHI nodes, we could be called on an + // instruction in an unreachable block, which may be on a cycle. + SmallPtrSet<Value *, 4> Visited; + Visited.insert(Ptr); + SmallVector<Value *, 4> Indices; + + // We may end up computing an offset pointer that has the wrong type. If we + // never are able to compute one directly that has the correct type, we'll + // fall back to it, so keep it around here. + Value *OffsetPtr = 0; + + // Remember any i8 pointer we come across to re-use if we need to do a raw + // byte offset. + Value *Int8Ptr = 0; + APInt Int8PtrOffset(Offset.getBitWidth(), 0); + + Type *TargetTy = PointerTy->getPointerElementType(); + + do { + // First fold any existing GEPs into the offset. + while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) { + APInt GEPOffset(Offset.getBitWidth(), 0); + if (!GEP->accumulateConstantOffset(TD, GEPOffset)) + break; + Offset += GEPOffset; + Ptr = GEP->getPointerOperand(); + if (!Visited.insert(Ptr)) + break; + } + + // See if we can perform a natural GEP here. + Indices.clear(); + if (Value *P = getNaturalGEPWithOffset(IRB, TD, Ptr, Offset, TargetTy, + Indices, Prefix)) { + if (P->getType() == PointerTy) { + // Zap any offset pointer that we ended up computing in previous rounds. + if (OffsetPtr && OffsetPtr->use_empty()) + if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) + I->eraseFromParent(); + return P; + } + if (!OffsetPtr) { + OffsetPtr = P; + } + } + + // Stash this pointer if we've found an i8*. + if (Ptr->getType()->isIntegerTy(8)) { + Int8Ptr = Ptr; + Int8PtrOffset = Offset; + } + + // Peel off a layer of the pointer and update the offset appropriately. + if (Operator::getOpcode(Ptr) == Instruction::BitCast) { + Ptr = cast<Operator>(Ptr)->getOperand(0); + } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) { + if (GA->mayBeOverridden()) + break; + Ptr = GA->getAliasee(); + } else { + break; + } + assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!"); + } while (Visited.insert(Ptr)); + + if (!OffsetPtr) { + if (!Int8Ptr) { + Int8Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(), + Prefix + ".raw_cast"); + Int8PtrOffset = Offset; + } + + OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr : + IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset), + Prefix + ".raw_idx"); + } + Ptr = OffsetPtr; + + // On the off chance we were targeting i8*, guard the bitcast here. + if (Ptr->getType() != PointerTy) + Ptr = IRB.CreateBitCast(Ptr, PointerTy, Prefix + ".cast"); + + return Ptr; +} + +/// \brief Test whether we can convert a value from the old to the new type. +/// +/// This predicate should be used to guard calls to convertValue in order to +/// ensure that we only try to convert viable values. The strategy is that we +/// will peel off single element struct and array wrappings to get to an +/// underlying value, and convert that value. 
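+///
+/// For example, on a target with 64-bit pointers this permits i64 <-> i8*
+/// (rewritten with ptrtoint/inttoptr) and float <-> i32 (a plain bitcast),
+/// but rejects i8* <-> double as well as any conversion that changes the
+/// total bit width or involves an aggregate type.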
+static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { + if (OldTy == NewTy) + return true; + if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy)) + return false; + if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) + return false; + + if (NewTy->isPointerTy() || OldTy->isPointerTy()) { + if (NewTy->isPointerTy() && OldTy->isPointerTy()) + return true; + if (NewTy->isIntegerTy() || OldTy->isIntegerTy()) + return true; + return false; + } + + return true; +} + +/// \brief Generic routine to convert an SSA value to a value of a different +/// type. +/// +/// This will try various different casting techniques, such as bitcasts, +/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test +/// two types for viability with this routine. +static Value *convertValue(const DataLayout &DL, IRBuilder<> &IRB, Value *V, + Type *Ty) { + assert(canConvertValue(DL, V->getType(), Ty) && + "Value not convertable to type"); + if (V->getType() == Ty) + return V; + if (V->getType()->isIntegerTy() && Ty->isPointerTy()) + return IRB.CreateIntToPtr(V, Ty); + if (V->getType()->isPointerTy() && Ty->isIntegerTy()) + return IRB.CreatePtrToInt(V, Ty); + + return IRB.CreateBitCast(V, Ty); +} + +/// \brief Test whether the given alloca partition can be promoted to a vector. +/// +/// This is a quick test to check whether we can rewrite a particular alloca +/// partition (and its newly formed alloca) into a vector alloca with only +/// whole-vector loads and stores such that it could be promoted to a vector +/// SSA value. We only can ensure this for a limited set of operations, and we +/// don't want to do the rewrites unless we are confident that the result will +/// be promotable, so we have an early test here. +static bool isVectorPromotionViable(const DataLayout &TD, + Type *AllocaTy, + AllocaPartitioning &P, + uint64_t PartitionBeginOffset, + uint64_t PartitionEndOffset, + AllocaPartitioning::const_use_iterator I, + AllocaPartitioning::const_use_iterator E) { + VectorType *Ty = dyn_cast<VectorType>(AllocaTy); + if (!Ty) + return false; + + uint64_t ElementSize = TD.getTypeSizeInBits(Ty->getScalarType()); + + // While the definition of LLVM vectors is bitpacked, we don't support sizes + // that aren't byte sized. + if (ElementSize % 8) + return false; + assert((TD.getTypeSizeInBits(Ty) % 8) == 0 && + "vector size not a multiple of element size?"); + ElementSize /= 8; + + for (; I != E; ++I) { + if (!I->U) + continue; // Skip dead use. + + uint64_t BeginOffset = I->BeginOffset - PartitionBeginOffset; + uint64_t BeginIndex = BeginOffset / ElementSize; + if (BeginIndex * ElementSize != BeginOffset || + BeginIndex >= Ty->getNumElements()) + return false; + uint64_t EndOffset = I->EndOffset - PartitionBeginOffset; + uint64_t EndIndex = EndOffset / ElementSize; + if (EndIndex * ElementSize != EndOffset || + EndIndex > Ty->getNumElements()) + return false; + + assert(EndIndex > BeginIndex && "Empty vector!"); + uint64_t NumElements = EndIndex - BeginIndex; + Type *PartitionTy + = (NumElements == 1) ? 
Ty->getElementType()
+                           : VectorType::get(Ty->getElementType(), NumElements);
+
+    if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) {
+      if (MI->isVolatile())
+        return false;
+      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) {
+        const AllocaPartitioning::MemTransferOffsets &MTO
+          = P.getMemTransferOffsets(*MTI);
+        if (!MTO.IsSplittable)
+          return false;
+      }
+    } else if (I->U->get()->getType()->getPointerElementType()->isStructTy()) {
+      // Disable vector promotion when there are loads or stores of an FCA.
+      return false;
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) {
+      if (LI->isVolatile())
+        return false;
+      if (!canConvertValue(TD, PartitionTy, LI->getType()))
+        return false;
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) {
+      if (SI->isVolatile())
+        return false;
+      if (!canConvertValue(TD, SI->getValueOperand()->getType(), PartitionTy))
+        return false;
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
+/// \brief Test whether the given alloca partition's integer operations can be
+/// widened to promotable ones.
+///
+/// This is a quick test to check whether we can rewrite the integer loads and
+/// stores to a particular alloca into wider loads and stores and be able to
+/// promote the resulting alloca.
+static bool isIntegerWideningViable(const DataLayout &TD,
+                                    Type *AllocaTy,
+                                    uint64_t AllocBeginOffset,
+                                    AllocaPartitioning &P,
+                                    AllocaPartitioning::const_use_iterator I,
+                                    AllocaPartitioning::const_use_iterator E) {
+  uint64_t SizeInBits = TD.getTypeSizeInBits(AllocaTy);
+  // Don't create integer types larger than the maximum bitwidth.
+  if (SizeInBits > IntegerType::MAX_INT_BITS)
+    return false;
+
+  // Don't try to handle allocas with bit-padding.
+  if (SizeInBits != TD.getTypeStoreSizeInBits(AllocaTy))
+    return false;
+
+  // We need to ensure that an integer type with the appropriate bitwidth can
+  // be converted to the alloca type, whatever that is. We don't want to force
+  // the alloca itself to have an integer type if there is a more suitable one.
+  Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
+  if (!canConvertValue(TD, AllocaTy, IntTy) ||
+      !canConvertValue(TD, IntTy, AllocaTy))
+    return false;
+
+  uint64_t Size = TD.getTypeStoreSize(AllocaTy);
+
+  // Check the uses to ensure they are (likely) promotable integer uses.
+  // Also ensure that the alloca has a covering load or store. We don't want
+  // to widen the integer operations only to fail to promote due to some other
+  // unsplittable entry (which we may make splittable later).
+  bool WholeAllocaOp = false;
+  for (; I != E; ++I) {
+    if (!I->U)
+      continue; // Skip dead use.
+
+    uint64_t RelBegin = I->BeginOffset - AllocBeginOffset;
+    uint64_t RelEnd = I->EndOffset - AllocBeginOffset;
+
+    // We can't reasonably handle cases where the load or store extends past
+    // the end of the alloca's type and into its padding.
+    if (RelEnd > Size)
+      return false;
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) {
+      if (LI->isVolatile())
+        return false;
+      if (RelBegin == 0 && RelEnd == Size)
+        WholeAllocaOp = true;
+      if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
+        if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy))
+          return false;
+        continue;
+      }
+      // Non-integer loads need to be convertible from the alloca type so that
+      // they are promotable.
+ if (RelBegin != 0 || RelEnd != Size || + !canConvertValue(TD, AllocaTy, LI->getType())) + return false; + } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) { + Type *ValueTy = SI->getValueOperand()->getType(); + if (SI->isVolatile()) + return false; + if (RelBegin == 0 && RelEnd == Size) + WholeAllocaOp = true; + if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) { + if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy)) + return false; + continue; + } + // Non-integer stores need to be convertible to the alloca type so that + // they are promotable. + if (RelBegin != 0 || RelEnd != Size || + !canConvertValue(TD, ValueTy, AllocaTy)) + return false; + } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) { + if (MI->isVolatile() || !isa<Constant>(MI->getLength())) + return false; + if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) { + const AllocaPartitioning::MemTransferOffsets &MTO + = P.getMemTransferOffsets(*MTI); + if (!MTO.IsSplittable) + return false; + } + } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I->U->getUser())) { + if (II->getIntrinsicID() != Intrinsic::lifetime_start && + II->getIntrinsicID() != Intrinsic::lifetime_end) + return false; + } else { + return false; + } + } + return WholeAllocaOp; +} + +static Value *extractInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *V, + IntegerType *Ty, uint64_t Offset, + const Twine &Name) { + DEBUG(dbgs() << " start: " << *V << "\n"); + IntegerType *IntTy = cast<IntegerType>(V->getType()); + assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && + "Element extends past full value"); + uint64_t ShAmt = 8*Offset; + if (DL.isBigEndian()) + ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); + if (ShAmt) { + V = IRB.CreateLShr(V, ShAmt, Name + ".shift"); + DEBUG(dbgs() << " shifted: " << *V << "\n"); + } + assert(Ty->getBitWidth() <= IntTy->getBitWidth() && + "Cannot extract to a larger integer!"); + if (Ty != IntTy) { + V = IRB.CreateTrunc(V, Ty, Name + ".trunc"); + DEBUG(dbgs() << " trunced: " << *V << "\n"); + } + return V; +} + +static Value *insertInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *Old, + Value *V, uint64_t Offset, const Twine &Name) { + IntegerType *IntTy = cast<IntegerType>(Old->getType()); + IntegerType *Ty = cast<IntegerType>(V->getType()); + assert(Ty->getBitWidth() <= IntTy->getBitWidth() && + "Cannot insert a larger integer!"); + DEBUG(dbgs() << " start: " << *V << "\n"); + if (Ty != IntTy) { + V = IRB.CreateZExt(V, IntTy, Name + ".ext"); + DEBUG(dbgs() << " extended: " << *V << "\n"); + } + assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && + "Element store outside of alloca store"); + uint64_t ShAmt = 8*Offset; + if (DL.isBigEndian()) + ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); + if (ShAmt) { + V = IRB.CreateShl(V, ShAmt, Name + ".shift"); + DEBUG(dbgs() << " shifted: " << *V << "\n"); + } + + if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) { + APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt); + Old = IRB.CreateAnd(Old, Mask, Name + ".mask"); + DEBUG(dbgs() << " masked: " << *Old << "\n"); + V = IRB.CreateOr(Old, V, Name + ".insert"); + DEBUG(dbgs() << " inserted: " << *V << "\n"); + } + return V; +} + +static Value *extractVector(IRBuilder<> &IRB, Value *V, + unsigned BeginIndex, unsigned EndIndex, + const Twine &Name) { + VectorType *VecTy = cast<VectorType>(V->getType()); + unsigned NumElements = EndIndex - 
BeginIndex; + assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); + + if (NumElements == VecTy->getNumElements()) + return V; + + if (NumElements == 1) { + V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex), + Name + ".extract"); + DEBUG(dbgs() << " extract: " << *V << "\n"); + return V; + } + + SmallVector<Constant*, 8> Mask; + Mask.reserve(NumElements); + for (unsigned i = BeginIndex; i != EndIndex; ++i) + Mask.push_back(IRB.getInt32(i)); + V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), + ConstantVector::get(Mask), + Name + ".extract"); + DEBUG(dbgs() << " shuffle: " << *V << "\n"); + return V; +} + +static Value *insertVector(IRBuilder<> &IRB, Value *Old, Value *V, + unsigned BeginIndex, const Twine &Name) { + VectorType *VecTy = cast<VectorType>(Old->getType()); + assert(VecTy && "Can only insert a vector into a vector"); + + VectorType *Ty = dyn_cast<VectorType>(V->getType()); + if (!Ty) { + // Single element to insert. + V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex), + Name + ".insert"); + DEBUG(dbgs() << " insert: " << *V << "\n"); + return V; + } + + assert(Ty->getNumElements() <= VecTy->getNumElements() && + "Too many elements!"); + if (Ty->getNumElements() == VecTy->getNumElements()) { + assert(V->getType() == VecTy && "Vector type mismatch"); + return V; + } + unsigned EndIndex = BeginIndex + Ty->getNumElements(); + + // When inserting a smaller vector into the larger to store, we first + // use a shuffle vector to widen it with undef elements, and then + // a second shuffle vector to select between the loaded vector and the + // incoming vector. + SmallVector<Constant*, 8> Mask; + Mask.reserve(VecTy->getNumElements()); + for (unsigned i = 0; i != VecTy->getNumElements(); ++i) + if (i >= BeginIndex && i < EndIndex) + Mask.push_back(IRB.getInt32(i - BeginIndex)); + else + Mask.push_back(UndefValue::get(IRB.getInt32Ty())); + V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), + ConstantVector::get(Mask), + Name + ".expand"); + DEBUG(dbgs() << " shuffle1: " << *V << "\n"); + + Mask.clear(); + for (unsigned i = 0; i != VecTy->getNumElements(); ++i) + if (i >= BeginIndex && i < EndIndex) + Mask.push_back(IRB.getInt32(i)); + else + Mask.push_back(IRB.getInt32(i + VecTy->getNumElements())); + V = IRB.CreateShuffleVector(V, Old, ConstantVector::get(Mask), + Name + "insert"); + DEBUG(dbgs() << " shuffle2: " << *V << "\n"); + return V; +} + +namespace { +/// \brief Visitor to rewrite instructions using a partition of an alloca to +/// use a new alloca. +/// +/// Also implements the rewriting to vector-based accesses when the partition +/// passes the isVectorPromotionViable predicate. Most of the rewriting logic +/// lives here. +class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter, + bool> { + // Befriend the base class so it can delegate to private visit methods. + friend class llvm::InstVisitor<AllocaPartitionRewriter, bool>; + + const DataLayout &TD; + AllocaPartitioning &P; + SROA &Pass; + AllocaInst &OldAI, &NewAI; + const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset; + Type *NewAllocaTy; + + // If we are rewriting an alloca partition which can be written as pure + // vector operations, we stash extra information here. When VecTy is + // non-null, we have some strict guarantees about the rewriten alloca: + // - The new alloca is exactly the size of the vector type here. + // - The accesses all either map to the entire vector or to a single + // element. 
+ // - The set of accessing instructions is only one of those handled above + // in isVectorPromotionViable. Generally these are the same access kinds + // which are promotable via mem2reg. + VectorType *VecTy; + Type *ElementTy; + uint64_t ElementSize; + + // This is a convenience and flag variable that will be null unless the new + // alloca's integer operations should be widened to this integer type due to + // passing isIntegerWideningViable above. If it is non-null, the desired + // integer type will be stored here for easy access during rewriting. + IntegerType *IntTy; + + // The offset of the partition user currently being rewritten. + uint64_t BeginOffset, EndOffset; + Use *OldUse; + Instruction *OldPtr; + + // The name prefix to use when rewriting instructions for this alloca. + std::string NamePrefix; + +public: + AllocaPartitionRewriter(const DataLayout &TD, AllocaPartitioning &P, + AllocaPartitioning::iterator PI, + SROA &Pass, AllocaInst &OldAI, AllocaInst &NewAI, + uint64_t NewBeginOffset, uint64_t NewEndOffset) + : TD(TD), P(P), Pass(Pass), + OldAI(OldAI), NewAI(NewAI), + NewAllocaBeginOffset(NewBeginOffset), + NewAllocaEndOffset(NewEndOffset), + NewAllocaTy(NewAI.getAllocatedType()), + VecTy(), ElementTy(), ElementSize(), IntTy(), + BeginOffset(), EndOffset() { + } + + /// \brief Visit the users of the alloca partition and rewrite them. + bool visitUsers(AllocaPartitioning::const_use_iterator I, + AllocaPartitioning::const_use_iterator E) { + if (isVectorPromotionViable(TD, NewAI.getAllocatedType(), P, + NewAllocaBeginOffset, NewAllocaEndOffset, + I, E)) { + ++NumVectorized; + VecTy = cast<VectorType>(NewAI.getAllocatedType()); + ElementTy = VecTy->getElementType(); + assert((TD.getTypeSizeInBits(VecTy->getScalarType()) % 8) == 0 && + "Only multiple-of-8 sized vector elements are viable"); + ElementSize = TD.getTypeSizeInBits(VecTy->getScalarType()) / 8; + } else if (isIntegerWideningViable(TD, NewAI.getAllocatedType(), + NewAllocaBeginOffset, P, I, E)) { + IntTy = Type::getIntNTy(NewAI.getContext(), + TD.getTypeSizeInBits(NewAI.getAllocatedType())); + } + bool CanSROA = true; + for (; I != E; ++I) { + if (!I->U) + continue; // Skip dead uses. + BeginOffset = I->BeginOffset; + EndOffset = I->EndOffset; + OldUse = I->U; + OldPtr = cast<Instruction>(I->U->get()); + NamePrefix = (Twine(NewAI.getName()) + "." + Twine(BeginOffset)).str(); + CanSROA &= visit(cast<Instruction>(I->U->getUser())); + } + if (VecTy) { + assert(CanSROA); + VecTy = 0; + ElementTy = 0; + ElementSize = 0; + } + if (IntTy) { + assert(CanSROA); + IntTy = 0; + } + return CanSROA; + } + +private: + // Every instruction which can end up as a user must have a rewrite rule. + bool visitInstruction(Instruction &I) { + DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n"); + llvm_unreachable("No rewrite rule for this instruction!"); + } + + Twine getName(const Twine &Suffix) { + return NamePrefix + Suffix; + } + + Value *getAdjustedAllocaPtr(IRBuilder<> &IRB, Type *PointerTy) { + assert(BeginOffset >= NewAllocaBeginOffset); + APInt Offset(TD.getPointerSizeInBits(), BeginOffset - NewAllocaBeginOffset); + return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy, getName("")); + } + + /// \brief Compute suitable alignment to access an offset into the new alloca. 
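+  /// If the new alloca carries no explicit alignment, the ABI alignment of
+  /// its allocated type is used as the base; the result is the largest power
+  /// of two that divides both that base alignment and the byte offset.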
+ unsigned getOffsetAlign(uint64_t Offset) { + unsigned NewAIAlign = NewAI.getAlignment(); + if (!NewAIAlign) + NewAIAlign = TD.getABITypeAlignment(NewAI.getAllocatedType()); + return MinAlign(NewAIAlign, Offset); + } + + /// \brief Compute suitable alignment to access this partition of the new + /// alloca. + unsigned getPartitionAlign() { + return getOffsetAlign(BeginOffset - NewAllocaBeginOffset); + } + + /// \brief Compute suitable alignment to access a type at an offset of the + /// new alloca. + /// + /// \returns zero if the type's ABI alignment is a suitable alignment, + /// otherwise returns the maximal suitable alignment. + unsigned getOffsetTypeAlign(Type *Ty, uint64_t Offset) { + unsigned Align = getOffsetAlign(Offset); + return Align == TD.getABITypeAlignment(Ty) ? 0 : Align; + } + + /// \brief Compute suitable alignment to access a type at the beginning of + /// this partition of the new alloca. + /// + /// See \c getOffsetTypeAlign for details; this routine delegates to it. + unsigned getPartitionTypeAlign(Type *Ty) { + return getOffsetTypeAlign(Ty, BeginOffset - NewAllocaBeginOffset); + } + + unsigned getIndex(uint64_t Offset) { + assert(VecTy && "Can only call getIndex when rewriting a vector"); + uint64_t RelOffset = Offset - NewAllocaBeginOffset; + assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds"); + uint32_t Index = RelOffset / ElementSize; + assert(Index * ElementSize == RelOffset); + return Index; + } + + void deleteIfTriviallyDead(Value *V) { + Instruction *I = cast<Instruction>(V); + if (isInstructionTriviallyDead(I)) + Pass.DeadInsts.insert(I); + } + + Value *rewriteVectorizedLoadInst(IRBuilder<> &IRB) { + unsigned BeginIndex = getIndex(BeginOffset); + unsigned EndIndex = getIndex(EndOffset); + assert(EndIndex > BeginIndex && "Empty vector!"); + + Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".load")); + return extractVector(IRB, V, BeginIndex, EndIndex, getName(".vec")); + } + + Value *rewriteIntegerLoad(IRBuilder<> &IRB, LoadInst &LI) { + assert(IntTy && "We cannot insert an integer to the alloca"); + assert(!LI.isVolatile()); + Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".load")); + V = convertValue(TD, IRB, V, IntTy); + assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = BeginOffset - NewAllocaBeginOffset; + if (Offset > 0 || EndOffset < NewAllocaEndOffset) + V = extractInteger(TD, IRB, V, cast<IntegerType>(LI.getType()), Offset, + getName(".extract")); + return V; + } + + bool visitLoadInst(LoadInst &LI) { + DEBUG(dbgs() << " original: " << LI << "\n"); + Value *OldOp = LI.getOperand(0); + assert(OldOp == OldPtr); + IRBuilder<> IRB(&LI); + + uint64_t Size = EndOffset - BeginOffset; + bool IsSplitIntLoad = Size < TD.getTypeStoreSize(LI.getType()); + + // If this memory access can be shown to *statically* extend outside the + // bounds of the original allocation it's behavior is undefined. Rather + // than trying to transform it, just replace it with undef. + // FIXME: We should do something more clever for functions being + // instrumented by asan. + // FIXME: Eventually, once ASan and friends can flush out bugs here, this + // should be transformed to a load of null making it unreachable. 
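To make the vector path above concrete: getIndex maps a partition's byte range onto element indices, and extractVector / insertVector (earlier in this hunk) turn those indices into shufflevector masks. Below is a plain C++ sketch of just that index and mask arithmetic for a hypothetical <8 x i32> alloca and a partition use covering bytes [8, 20); none of it uses the LLVM API.

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t ElementSize = 4, NumVecElements = 8; // <8 x i32>
  const uint64_t BeginOffset = 8, EndOffset = 20;     // hypothetical use

  uint64_t BeginIndex = BeginOffset / ElementSize;    // getIndex(BeginOffset) == 2
  uint64_t EndIndex = EndOffset / ElementSize;        // getIndex(EndOffset)   == 5

  // extractVector: select elements [BeginIndex, EndIndex) from the loaded value.
  std::printf("extract mask: ");
  for (uint64_t i = BeginIndex; i != EndIndex; ++i)
    std::printf("%llu ", (unsigned long long)i);

  // insertVector, first shuffle: widen the small value to the full width,
  // leaving the untouched lanes undef ('u').
  std::printf("\nexpand mask:  ");
  for (uint64_t i = 0; i != NumVecElements; ++i)
    if (i >= BeginIndex && i < EndIndex)
      std::printf("%llu ", (unsigned long long)(i - BeginIndex));
    else
      std::printf("u ");

  // insertVector, second shuffle: blend the widened value (lanes 0..7) with
  // the old loaded vector (lanes 8..15), keeping old lanes outside the range.
  std::printf("\nblend mask:   ");
  for (uint64_t i = 0; i != NumVecElements; ++i)
    std::printf("%llu ", (unsigned long long)(
        i >= BeginIndex && i < EndIndex ? i : i + NumVecElements));
  std::printf("\n");
  return 0;
}

For a single covered element the helpers above skip the masks entirely and fall back to extractelement / insertelement.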
+ uint64_t OldAllocSize = TD.getTypeAllocSize(OldAI.getAllocatedType()); + if (TD.getTypeStoreSize(LI.getType()) > OldAllocSize) { + LI.replaceAllUsesWith(UndefValue::get(LI.getType())); + Pass.DeadInsts.insert(&LI); + deleteIfTriviallyDead(OldOp); + DEBUG(dbgs() << " to: undef!!\n"); + return true; + } + + Type *TargetTy = IsSplitIntLoad ? Type::getIntNTy(LI.getContext(), Size * 8) + : LI.getType(); + bool IsPtrAdjusted = false; + Value *V; + if (VecTy) { + V = rewriteVectorizedLoadInst(IRB); + } else if (IntTy && LI.getType()->isIntegerTy()) { + V = rewriteIntegerLoad(IRB, LI); + } else if (BeginOffset == NewAllocaBeginOffset && + canConvertValue(TD, NewAllocaTy, LI.getType())) { + V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + LI.isVolatile(), getName(".load")); + } else { + Type *LTy = TargetTy->getPointerTo(); + V = IRB.CreateAlignedLoad(getAdjustedAllocaPtr(IRB, LTy), + getPartitionTypeAlign(TargetTy), + LI.isVolatile(), getName(".load")); + IsPtrAdjusted = true; + } + V = convertValue(TD, IRB, V, TargetTy); + + if (IsSplitIntLoad) { + assert(!LI.isVolatile()); + assert(LI.getType()->isIntegerTy() && + "Only integer type loads and stores are split"); + assert(LI.getType()->getIntegerBitWidth() == + TD.getTypeStoreSizeInBits(LI.getType()) && + "Non-byte-multiple bit width"); + assert(LI.getType()->getIntegerBitWidth() == + TD.getTypeAllocSizeInBits(OldAI.getAllocatedType()) && + "Only alloca-wide loads can be split and recomposed"); + // Move the insertion point just past the load so that we can refer to it. + IRB.SetInsertPoint(llvm::next(BasicBlock::iterator(&LI))); + // Create a placeholder value with the same type as LI to use as the + // basis for the new value. This allows us to replace the uses of LI with + // the computed value, and then replace the placeholder with LI, leaving + // LI only used for this computation. + Value *Placeholder + = new LoadInst(UndefValue::get(LI.getType()->getPointerTo())); + V = insertInteger(TD, IRB, Placeholder, V, BeginOffset, + getName(".insert")); + LI.replaceAllUsesWith(V); + Placeholder->replaceAllUsesWith(&LI); + delete Placeholder; + } else { + LI.replaceAllUsesWith(V); + } + + Pass.DeadInsts.insert(&LI); + deleteIfTriviallyDead(OldOp); + DEBUG(dbgs() << " to: " << *V << "\n"); + return !LI.isVolatile() && !IsPtrAdjusted; + } + + bool rewriteVectorizedStoreInst(IRBuilder<> &IRB, Value *V, + StoreInst &SI, Value *OldOp) { + unsigned BeginIndex = getIndex(BeginOffset); + unsigned EndIndex = getIndex(EndOffset); + assert(EndIndex > BeginIndex && "Empty vector!"); + unsigned NumElements = EndIndex - BeginIndex; + assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); + Type *PartitionTy + = (NumElements == 1) ? ElementTy + : VectorType::get(ElementTy, NumElements); + if (V->getType() != PartitionTy) + V = convertValue(TD, IRB, V, PartitionTy); + + // Mix in the existing elements. 
+ Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".load")); + V = insertVector(IRB, Old, V, BeginIndex, getName(".vec")); + + StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); + Pass.DeadInsts.insert(&SI); + + (void)Store; + DEBUG(dbgs() << " to: " << *Store << "\n"); + return true; + } + + bool rewriteIntegerStore(IRBuilder<> &IRB, Value *V, StoreInst &SI) { + assert(IntTy && "We cannot extract an integer from the alloca"); + assert(!SI.isVolatile()); + if (TD.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { + Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".oldload")); + Old = convertValue(TD, IRB, Old, IntTy); + assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = BeginOffset - NewAllocaBeginOffset; + V = insertInteger(TD, IRB, Old, SI.getValueOperand(), Offset, + getName(".insert")); + } + V = convertValue(TD, IRB, V, NewAllocaTy); + StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); + Pass.DeadInsts.insert(&SI); + (void)Store; + DEBUG(dbgs() << " to: " << *Store << "\n"); + return true; + } + + bool visitStoreInst(StoreInst &SI) { + DEBUG(dbgs() << " original: " << SI << "\n"); + Value *OldOp = SI.getOperand(1); + assert(OldOp == OldPtr); + IRBuilder<> IRB(&SI); + + Value *V = SI.getValueOperand(); + + // Strip all inbounds GEPs and pointer casts to try to dig out any root + // alloca that should be re-examined after promoting this alloca. + if (V->getType()->isPointerTy()) + if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets())) + Pass.PostPromotionWorklist.insert(AI); + + uint64_t Size = EndOffset - BeginOffset; + if (Size < TD.getTypeStoreSize(V->getType())) { + assert(!SI.isVolatile()); + assert(V->getType()->isIntegerTy() && + "Only integer type loads and stores are split"); + assert(V->getType()->getIntegerBitWidth() == + TD.getTypeStoreSizeInBits(V->getType()) && + "Non-byte-multiple bit width"); + assert(V->getType()->getIntegerBitWidth() == + TD.getTypeAllocSizeInBits(OldAI.getAllocatedType()) && + "Only alloca-wide stores can be split and recomposed"); + IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), Size * 8); + V = extractInteger(TD, IRB, V, NarrowTy, BeginOffset, + getName(".extract")); + } + + if (VecTy) + return rewriteVectorizedStoreInst(IRB, V, SI, OldOp); + if (IntTy && V->getType()->isIntegerTy()) + return rewriteIntegerStore(IRB, V, SI); + + StoreInst *NewSI; + if (BeginOffset == NewAllocaBeginOffset && + canConvertValue(TD, V->getType(), NewAllocaTy)) { + V = convertValue(TD, IRB, V, NewAllocaTy); + NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), + SI.isVolatile()); + } else { + Value *NewPtr = getAdjustedAllocaPtr(IRB, V->getType()->getPointerTo()); + NewSI = IRB.CreateAlignedStore(V, NewPtr, + getPartitionTypeAlign(V->getType()), + SI.isVolatile()); + } + (void)NewSI; + Pass.DeadInsts.insert(&SI); + deleteIfTriviallyDead(OldOp); + + DEBUG(dbgs() << " to: " << *NewSI << "\n"); + return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile(); + } + + /// \brief Compute an integer value from splatting an i8 across the given + /// number of bytes. + /// + /// Note that this routine assumes an i8 is a byte. If that isn't true, don't + /// call this routine. + /// FIXME: Heed the abvice above. + /// + /// \param V The i8 value to splat. 
+ /// \param Size The number of bytes in the output (assuming i8 is one byte) + Value *getIntegerSplat(IRBuilder<> &IRB, Value *V, unsigned Size) { + assert(Size > 0 && "Expected a positive number of bytes."); + IntegerType *VTy = cast<IntegerType>(V->getType()); + assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte"); + if (Size == 1) + return V; + + Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8); + V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, getName(".zext")), + ConstantExpr::getUDiv( + Constant::getAllOnesValue(SplatIntTy), + ConstantExpr::getZExt( + Constant::getAllOnesValue(V->getType()), + SplatIntTy)), + getName(".isplat")); + return V; + } + + /// \brief Compute a vector splat for a given element value. + Value *getVectorSplat(IRBuilder<> &IRB, Value *V, unsigned NumElements) { + V = IRB.CreateVectorSplat(NumElements, V, NamePrefix); + DEBUG(dbgs() << " splat: " << *V << "\n"); + return V; + } + + bool visitMemSetInst(MemSetInst &II) { + DEBUG(dbgs() << " original: " << II << "\n"); + IRBuilder<> IRB(&II); + assert(II.getRawDest() == OldPtr); + + // If the memset has a variable size, it cannot be split, just adjust the + // pointer to the new alloca. + if (!isa<Constant>(II.getLength())) { + II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType())); + Type *CstTy = II.getAlignmentCst()->getType(); + II.setAlignment(ConstantInt::get(CstTy, getPartitionAlign())); + + deleteIfTriviallyDead(OldPtr); + return false; + } + + // Record this instruction for deletion. + Pass.DeadInsts.insert(&II); + + Type *AllocaTy = NewAI.getAllocatedType(); + Type *ScalarTy = AllocaTy->getScalarType(); + + // If this doesn't map cleanly onto the alloca type, and that type isn't + // a single value type, just emit a memset. + if (!VecTy && !IntTy && + (BeginOffset != NewAllocaBeginOffset || + EndOffset != NewAllocaEndOffset || + !AllocaTy->isSingleValueType() || + !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)) || + TD.getTypeSizeInBits(ScalarTy)%8 != 0)) { + Type *SizeTy = II.getLength()->getType(); + Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); + CallInst *New + = IRB.CreateMemSet(getAdjustedAllocaPtr(IRB, + II.getRawDest()->getType()), + II.getValue(), Size, getPartitionAlign(), + II.isVolatile()); + (void)New; + DEBUG(dbgs() << " to: " << *New << "\n"); + return false; + } + + // If we can represent this as a simple value, we have to build the actual + // value to store, which requires expanding the byte present in memset to + // a sensible representation for the alloca type. This is essentially + // splatting the byte to a sufficiently wide integer, splatting it across + // any desired vector width, and bitcasting to the final type. + Value *V; + + if (VecTy) { + // If this is a memset of a vectorized alloca, insert it. 
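The multiply in getIntegerSplat above is the classic byte-replication trick: all-ones divided by 0xFF yields 0x0101...01, and multiplying the memset byte by that constant copies it into every byte position. A quick plain-C++ check with a hypothetical byte value and a 4-byte width:

#include <cstdint>
#include <cstdio>

int main() {
  const uint8_t Byte = 0xAB;                     // hypothetical memset value
  uint32_t Multiplier = 0xFFFFFFFFu / 0xFFu;     // 0x01010101, the UDiv constant
  uint32_t Splat = uint32_t(Byte) * Multiplier;  // 0xABABABAB

  std::printf("multiplier = 0x%08X  splat = 0x%08X\n", Multiplier, Splat);
  return 0;
}

In the vectorized case just below, that integer splat is converted to the element type and, when the partition covers more than one element, broadcast with CreateVectorSplat before insertVector mixes it into the loaded vector.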
+ assert(ElementTy == ScalarTy); + + unsigned BeginIndex = getIndex(BeginOffset); + unsigned EndIndex = getIndex(EndOffset); + assert(EndIndex > BeginIndex && "Empty vector!"); + unsigned NumElements = EndIndex - BeginIndex; + assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); + + Value *Splat = getIntegerSplat(IRB, II.getValue(), + TD.getTypeSizeInBits(ElementTy)/8); + Splat = convertValue(TD, IRB, Splat, ElementTy); + if (NumElements > 1) + Splat = getVectorSplat(IRB, Splat, NumElements); + + Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".oldload")); + V = insertVector(IRB, Old, Splat, BeginIndex, getName(".vec")); + } else if (IntTy) { + // If this is a memset on an alloca where we can widen stores, insert the + // set integer. + assert(!II.isVolatile()); + + uint64_t Size = EndOffset - BeginOffset; + V = getIntegerSplat(IRB, II.getValue(), Size); + + if (IntTy && (BeginOffset != NewAllocaBeginOffset || + EndOffset != NewAllocaBeginOffset)) { + Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".oldload")); + Old = convertValue(TD, IRB, Old, IntTy); + assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = BeginOffset - NewAllocaBeginOffset; + V = insertInteger(TD, IRB, Old, V, Offset, getName(".insert")); + } else { + assert(V->getType() == IntTy && + "Wrong type for an alloca wide integer!"); + } + V = convertValue(TD, IRB, V, AllocaTy); + } else { + // Established these invariants above. + assert(BeginOffset == NewAllocaBeginOffset); + assert(EndOffset == NewAllocaEndOffset); + + V = getIntegerSplat(IRB, II.getValue(), + TD.getTypeSizeInBits(ScalarTy)/8); + if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy)) + V = getVectorSplat(IRB, V, AllocaVecTy->getNumElements()); + + V = convertValue(TD, IRB, V, AllocaTy); + } + + Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), + II.isVolatile()); + (void)New; + DEBUG(dbgs() << " to: " << *New << "\n"); + return !II.isVolatile(); + } + + bool visitMemTransferInst(MemTransferInst &II) { + // Rewriting of memory transfer instructions can be a bit tricky. We break + // them into two categories: split intrinsics and unsplit intrinsics. + + DEBUG(dbgs() << " original: " << II << "\n"); + IRBuilder<> IRB(&II); + + assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr); + bool IsDest = II.getRawDest() == OldPtr; + + const AllocaPartitioning::MemTransferOffsets &MTO + = P.getMemTransferOffsets(II); + + // Compute the relative offset within the transfer. + unsigned IntPtrWidth = TD.getPointerSizeInBits(); + APInt RelOffset(IntPtrWidth, BeginOffset - (IsDest ? MTO.DestBegin + : MTO.SourceBegin)); + + unsigned Align = II.getAlignment(); + if (Align > 1) + Align = MinAlign(RelOffset.zextOrTrunc(64).getZExtValue(), + MinAlign(II.getAlignment(), getPartitionAlign())); + + // For unsplit intrinsics, we simply modify the source and destination + // pointers in place. This isn't just an optimization, it is a matter of + // correctness. With unsplit intrinsics we may be dealing with transfers + // within a single alloca before SROA ran, or with transfers that have + // a variable length. We may also be dealing with memmove instead of + // memcpy, and so simply updating the pointers is the necessary for us to + // update both source and dest of a single call. + if (!MTO.IsSplittable) { + Value *OldOp = IsDest ? 
II.getRawDest() : II.getRawSource(); + if (IsDest) + II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType())); + else + II.setSource(getAdjustedAllocaPtr(IRB, II.getRawSource()->getType())); + + Type *CstTy = II.getAlignmentCst()->getType(); + II.setAlignment(ConstantInt::get(CstTy, Align)); + + DEBUG(dbgs() << " to: " << II << "\n"); + deleteIfTriviallyDead(OldOp); + return false; + } + // For split transfer intrinsics we have an incredibly useful assurance: + // the source and destination do not reside within the same alloca, and at + // least one of them does not escape. This means that we can replace + // memmove with memcpy, and we don't need to worry about all manner of + // downsides to splitting and transforming the operations. + + // If this doesn't map cleanly onto the alloca type, and that type isn't + // a single value type, just emit a memcpy. + bool EmitMemCpy + = !VecTy && !IntTy && (BeginOffset != NewAllocaBeginOffset || + EndOffset != NewAllocaEndOffset || + !NewAI.getAllocatedType()->isSingleValueType()); + + // If we're just going to emit a memcpy, the alloca hasn't changed, and the + // size hasn't been shrunk based on analysis of the viable range, this is + // a no-op. + if (EmitMemCpy && &OldAI == &NewAI) { + uint64_t OrigBegin = IsDest ? MTO.DestBegin : MTO.SourceBegin; + uint64_t OrigEnd = IsDest ? MTO.DestEnd : MTO.SourceEnd; + // Ensure the start lines up. + assert(BeginOffset == OrigBegin); + (void)OrigBegin; + + // Rewrite the size as needed. + if (EndOffset != OrigEnd) + II.setLength(ConstantInt::get(II.getLength()->getType(), + EndOffset - BeginOffset)); + return false; + } + // Record this instruction for deletion. + Pass.DeadInsts.insert(&II); + + // Strip all inbounds GEPs and pointer casts to try to dig out any root + // alloca that should be re-examined after rewriting this instruction. + Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest(); + if (AllocaInst *AI + = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) + Pass.Worklist.insert(AI); + + if (EmitMemCpy) { + Type *OtherPtrTy = IsDest ? II.getRawSource()->getType() + : II.getRawDest()->getType(); + + // Compute the other pointer, folding as much as possible to produce + // a single, simple GEP in most cases. + OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy, + getName("." + OtherPtr->getName())); + + Value *OurPtr + = getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType() + : II.getRawSource()->getType()); + Type *SizeTy = II.getLength()->getType(); + Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); + + CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr, + IsDest ? OtherPtr : OurPtr, + Size, Align, II.isVolatile()); + (void)New; + DEBUG(dbgs() << " to: " << *New << "\n"); + return false; + } + + // Note that we clamp the alignment to 1 here as a 0 alignment for a memcpy + // is equivalent to 1, but that isn't true if we end up rewriting this as + // a load or store. + if (!Align) + Align = 1; + + bool IsWholeAlloca = BeginOffset == NewAllocaBeginOffset && + EndOffset == NewAllocaEndOffset; + uint64_t Size = EndOffset - BeginOffset; + unsigned BeginIndex = VecTy ? getIndex(BeginOffset) : 0; + unsigned EndIndex = VecTy ? getIndex(EndOffset) : 0; + unsigned NumElements = EndIndex - BeginIndex; + IntegerType *SubIntTy + = IntTy ? 
Type::getIntNTy(IntTy->getContext(), Size*8) : 0; + + Type *OtherPtrTy = NewAI.getType(); + if (VecTy && !IsWholeAlloca) { + if (NumElements == 1) + OtherPtrTy = VecTy->getElementType(); + else + OtherPtrTy = VectorType::get(VecTy->getElementType(), NumElements); + + OtherPtrTy = OtherPtrTy->getPointerTo(); + } else if (IntTy && !IsWholeAlloca) { + OtherPtrTy = SubIntTy->getPointerTo(); + } + + Value *SrcPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy, + getName("." + OtherPtr->getName())); + Value *DstPtr = &NewAI; + if (!IsDest) + std::swap(SrcPtr, DstPtr); + + Value *Src; + if (VecTy && !IsWholeAlloca && !IsDest) { + Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".load")); + Src = extractVector(IRB, Src, BeginIndex, EndIndex, getName(".vec")); + } else if (IntTy && !IsWholeAlloca && !IsDest) { + Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".load")); + Src = convertValue(TD, IRB, Src, IntTy); + assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = BeginOffset - NewAllocaBeginOffset; + Src = extractInteger(TD, IRB, Src, SubIntTy, Offset, getName(".extract")); + } else { + Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(), + getName(".copyload")); + } + + if (VecTy && !IsWholeAlloca && IsDest) { + Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".oldload")); + Src = insertVector(IRB, Old, Src, BeginIndex, getName(".vec")); + } else if (IntTy && !IsWholeAlloca && IsDest) { + Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".oldload")); + Old = convertValue(TD, IRB, Old, IntTy); + assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = BeginOffset - NewAllocaBeginOffset; + Src = insertInteger(TD, IRB, Old, Src, Offset, getName(".insert")); + Src = convertValue(TD, IRB, Src, NewAllocaTy); + } + + StoreInst *Store = cast<StoreInst>( + IRB.CreateAlignedStore(Src, DstPtr, Align, II.isVolatile())); + (void)Store; + DEBUG(dbgs() << " to: " << *Store << "\n"); + return !II.isVolatile(); + } + + bool visitIntrinsicInst(IntrinsicInst &II) { + assert(II.getIntrinsicID() == Intrinsic::lifetime_start || + II.getIntrinsicID() == Intrinsic::lifetime_end); + DEBUG(dbgs() << " original: " << II << "\n"); + IRBuilder<> IRB(&II); + assert(II.getArgOperand(1) == OldPtr); + + // Record this instruction for deletion. + Pass.DeadInsts.insert(&II); + + ConstantInt *Size + = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), + EndOffset - BeginOffset); + Value *Ptr = getAdjustedAllocaPtr(IRB, II.getArgOperand(1)->getType()); + Value *New; + if (II.getIntrinsicID() == Intrinsic::lifetime_start) + New = IRB.CreateLifetimeStart(Ptr, Size); + else + New = IRB.CreateLifetimeEnd(Ptr, Size); + + DEBUG(dbgs() << " to: " << *New << "\n"); + return true; + } + + bool visitPHINode(PHINode &PN) { + DEBUG(dbgs() << " original: " << PN << "\n"); + + // We would like to compute a new pointer in only one place, but have it be + // as local as possible to the PHI. To do that, we re-use the location of + // the old pointer, which necessarily must be in the right position to + // dominate the PHI. + IRBuilder<> PtrBuilder(cast<Instruction>(OldPtr)); + + Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType()); + // Replace the operands which were using the old pointer. 
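For the split memcpy/memmove handling above, the other pointer is advanced by the partition's position inside the original transfer (RelOffset) and the alignment is conservatively re-derived from the original call alignment and the partition alignment. A small plain-C++ sketch of that bookkeeping with hypothetical numbers:

#include <cstdint>
#include <cstdio>

static uint64_t minAlign(uint64_t A, uint64_t B) {
  uint64_t X = A | B;
  return X & (~X + 1); // lowest set bit, as in llvm::MinAlign
}

int main() {
  // Hypothetical split memcpy: the original call copied bytes [0, 32) of the
  // destination alloca with 16-byte alignment; the partition being rewritten
  // covers bytes [8, 16).
  const uint64_t DestBegin = 0;       // MTO.DestBegin
  const uint64_t BeginOffset = 8;     // partition begin
  const uint64_t CallAlign = 16;      // II.getAlignment()
  const uint64_t PartitionAlign = 8;  // getPartitionAlign()

  uint64_t RelOffset = BeginOffset - DestBegin; // offset into the transfer
  uint64_t Align = minAlign(RelOffset, minAlign(CallAlign, PartitionAlign));

  std::printf("RelOffset = %llu  new alignment = %llu\n",
              (unsigned long long)RelOffset, (unsigned long long)Align);
  return 0;
}

As in the code above, the recomputation only happens when the call claimed an alignment greater than 1, and a resulting 0 is later clamped to 1 once the transfer is rewritten as a plain load and store.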
+ std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr); + + DEBUG(dbgs() << " to: " << PN << "\n"); + deleteIfTriviallyDead(OldPtr); + return false; + } + + bool visitSelectInst(SelectInst &SI) { + DEBUG(dbgs() << " original: " << SI << "\n"); + IRBuilder<> IRB(&SI); + + // Find the operand we need to rewrite here. + bool IsTrueVal = SI.getTrueValue() == OldPtr; + if (IsTrueVal) + assert(SI.getFalseValue() != OldPtr && "Pointer is both operands!"); + else + assert(SI.getFalseValue() == OldPtr && "Pointer isn't an operand!"); + + Value *NewPtr = getAdjustedAllocaPtr(IRB, OldPtr->getType()); + SI.setOperand(IsTrueVal ? 1 : 2, NewPtr); + DEBUG(dbgs() << " to: " << SI << "\n"); + deleteIfTriviallyDead(OldPtr); + return false; + } + +}; +} + +namespace { +/// \brief Visitor to rewrite aggregate loads and stores as scalar. +/// +/// This pass aggressively rewrites all aggregate loads and stores on +/// a particular pointer (or any pointer derived from it which we can identify) +/// with scalar loads and stores. +class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { + // Befriend the base class so it can delegate to private visit methods. + friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>; + + const DataLayout &TD; + + /// Queue of pointer uses to analyze and potentially rewrite. + SmallVector<Use *, 8> Queue; + + /// Set to prevent us from cycling with phi nodes and loops. + SmallPtrSet<User *, 8> Visited; + + /// The current pointer use being rewritten. This is used to dig up the used + /// value (as opposed to the user). + Use *U; + +public: + AggLoadStoreRewriter(const DataLayout &TD) : TD(TD) {} + + /// Rewrite loads and stores through a pointer and all pointers derived from + /// it. + bool rewrite(Instruction &I) { + DEBUG(dbgs() << " Rewriting FCA loads and stores...\n"); + enqueueUsers(I); + bool Changed = false; + while (!Queue.empty()) { + U = Queue.pop_back_val(); + Changed |= visit(cast<Instruction>(U->getUser())); + } + return Changed; + } + +private: + /// Enqueue all the users of the given instruction for further processing. + /// This uses a set to de-duplicate users. + void enqueueUsers(Instruction &I) { + for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE; + ++UI) + if (Visited.insert(*UI)) + Queue.push_back(&UI.getUse()); + } + + // Conservative default is to not rewrite anything. + bool visitInstruction(Instruction &I) { return false; } + + /// \brief Generic recursive split emission class. + template <typename Derived> + class OpSplitter { + protected: + /// The builder used to form new instructions. + IRBuilder<> IRB; + /// The indices which to be used with insert- or extractvalue to select the + /// appropriate value within the aggregate. + SmallVector<unsigned, 4> Indices; + /// The indices to a GEP instruction which will move Ptr to the correct slot + /// within the aggregate. + SmallVector<Value *, 4> GEPIndices; + /// The base pointer of the original op, used as a base for GEPing the + /// split operations. + Value *Ptr; + + /// Initialize the splitter with an insertion point, Ptr and start with a + /// single zero GEP index. + OpSplitter(Instruction *InsertionPoint, Value *Ptr) + : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {} + + public: + /// \brief Generic recursive split emission routine. + /// + /// This method recursively splits an aggregate op (load or store) into + /// scalar or vector ops. 
It splits recursively until it hits a single value + /// and emits that single value operation via the template argument. + /// + /// The logic of this routine relies on GEPs and insertvalue and + /// extractvalue all operating with the same fundamental index list, merely + /// formatted differently (GEPs need actual values). + /// + /// \param Ty The type being split recursively into smaller ops. + /// \param Agg The aggregate value being built up or stored, depending on + /// whether this is splitting a load or a store respectively. + void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) { + if (Ty->isSingleValueType()) + return static_cast<Derived *>(this)->emitFunc(Ty, Agg, Name); + + if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { + unsigned OldSize = Indices.size(); + (void)OldSize; + for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size; + ++Idx) { + assert(Indices.size() == OldSize && "Did not return to the old size"); + Indices.push_back(Idx); + GEPIndices.push_back(IRB.getInt32(Idx)); + emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx)); + GEPIndices.pop_back(); + Indices.pop_back(); + } + return; + } + + if (StructType *STy = dyn_cast<StructType>(Ty)) { + unsigned OldSize = Indices.size(); + (void)OldSize; + for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size; + ++Idx) { + assert(Indices.size() == OldSize && "Did not return to the old size"); + Indices.push_back(Idx); + GEPIndices.push_back(IRB.getInt32(Idx)); + emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx)); + GEPIndices.pop_back(); + Indices.pop_back(); + } + return; + } + + llvm_unreachable("Only arrays and structs are aggregate loadable types"); + } + }; + + struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> { + LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr) + : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {} + + /// Emit a leaf load of a single value. This is called at the leaves of the + /// recursive emission to actually load values. + void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { + assert(Ty->isSingleValueType()); + // Load the single value and insert it using the indices. + Value *Load = IRB.CreateLoad(IRB.CreateInBoundsGEP(Ptr, GEPIndices, + Name + ".gep"), + Name + ".load"); + Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); + DEBUG(dbgs() << " to: " << *Load << "\n"); + } + }; + + bool visitLoadInst(LoadInst &LI) { + assert(LI.getPointerOperand() == *U); + if (!LI.isSimple() || LI.getType()->isSingleValueType()) + return false; + + // We have an aggregate being loaded, split it apart. + DEBUG(dbgs() << " original: " << LI << "\n"); + LoadOpSplitter Splitter(&LI, *U); + Value *V = UndefValue::get(LI.getType()); + Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca"); + LI.replaceAllUsesWith(V); + LI.eraseFromParent(); + return true; + } + + struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> { + StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr) + : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {} + + /// Emit a leaf store of a single value. This is called at the leaves of the + /// recursive emission to actually produce stores. + void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { + assert(Ty->isSingleValueType()); + // Extract the single value and store it using the indices. 
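The emitSplitOps recursion above (shared by LoadOpSplitter and the StoreOpSplitter that continues below) keeps one index list that serves both as GEP indices and as insertvalue/extractvalue indices. The toy program below, plain C++ with an invented type encoding, prints the index paths such a walk produces for a hypothetical aggregate { i32, [2 x float], { i8, i64 } }.

#include <cstdio>
#include <string>
#include <vector>

// Toy stand-in for an LLVM aggregate type: a leaf (single value type) or a
// list of members (struct or array).
struct ToyType {
  std::string Name;               // non-empty for leaves, e.g. "i32"
  std::vector<ToyType> Members;   // non-empty for aggregates
  bool isLeaf() const { return Members.empty(); }
};

// Mirror of the emitSplitOps walk: recurse until a single value type is
// reached, then "emit" one scalar op carrying the accumulated index path.
static void splitOps(const ToyType &Ty, std::vector<unsigned> &Indices) {
  if (Ty.isLeaf()) {
    std::printf("  leaf %-6s indices = { ", Ty.Name.c_str());
    for (unsigned Idx : Indices)
      std::printf("%u ", Idx);
    std::printf("}\n");
    return;
  }
  for (unsigned Idx = 0; Idx != Ty.Members.size(); ++Idx) {
    Indices.push_back(Idx);
    splitOps(Ty.Members[Idx], Indices);
    Indices.pop_back();
  }
}

int main() {
  // Hypothetical FCA: { i32, [2 x float], { i8, i64 } }
  ToyType I32{"i32", {}}, F32{"float", {}}, I8{"i8", {}}, I64{"i64", {}};
  ToyType Arr{"", {F32, F32}};
  ToyType Inner{"", {I8, I64}};
  ToyType Ty{"", {I32, Arr, Inner}};

  std::vector<unsigned> Indices;
  std::printf("FCA split order:\n");
  splitOps(Ty, Indices);
  return 0;
}

At each printed leaf, LoadOpSplitter emits a GEP plus load and folds the result in with insertvalue, while StoreOpSplitter extracts the leaf with extractvalue and stores it through the corresponding GEP.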
+ Value *Store = IRB.CreateStore( + IRB.CreateExtractValue(Agg, Indices, Name + ".extract"), + IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep")); + (void)Store; + DEBUG(dbgs() << " to: " << *Store << "\n"); + } + }; + + bool visitStoreInst(StoreInst &SI) { + if (!SI.isSimple() || SI.getPointerOperand() != *U) + return false; + Value *V = SI.getValueOperand(); + if (V->getType()->isSingleValueType()) + return false; + + // We have an aggregate being stored, split it apart. + DEBUG(dbgs() << " original: " << SI << "\n"); + StoreOpSplitter Splitter(&SI, *U); + Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca"); + SI.eraseFromParent(); + return true; + } + + bool visitBitCastInst(BitCastInst &BC) { + enqueueUsers(BC); + return false; + } + + bool visitGetElementPtrInst(GetElementPtrInst &GEPI) { + enqueueUsers(GEPI); + return false; + } + + bool visitPHINode(PHINode &PN) { + enqueueUsers(PN); + return false; + } + + bool visitSelectInst(SelectInst &SI) { + enqueueUsers(SI); + return false; + } +}; +} + +/// \brief Strip aggregate type wrapping. +/// +/// This removes no-op aggregate types wrapping an underlying type. It will +/// strip as many layers of types as it can without changing either the type +/// size or the allocated size. +static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) { + if (Ty->isSingleValueType()) + return Ty; + + uint64_t AllocSize = DL.getTypeAllocSize(Ty); + uint64_t TypeSize = DL.getTypeSizeInBits(Ty); + + Type *InnerTy; + if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { + InnerTy = ArrTy->getElementType(); + } else if (StructType *STy = dyn_cast<StructType>(Ty)) { + const StructLayout *SL = DL.getStructLayout(STy); + unsigned Index = SL->getElementContainingOffset(0); + InnerTy = STy->getElementType(Index); + } else { + return Ty; + } + + if (AllocSize > DL.getTypeAllocSize(InnerTy) || + TypeSize > DL.getTypeSizeInBits(InnerTy)) + return Ty; + + return stripAggregateTypeWrapping(DL, InnerTy); +} + +/// \brief Try to find a partition of the aggregate type passed in for a given +/// offset and size. +/// +/// This recurses through the aggregate type and tries to compute a subtype +/// based on the offset and size. When the offset and size span a sub-section +/// of an array, it will even compute a new array type for that sub-section, +/// and the same for structs. +/// +/// Note that this routine is very strict and tries to find a partition of the +/// type which produces the *exact* right offset and size. It is not forgiving +/// when the size or offset cause either end of type-based partition to be off. +/// Also, this is a best-effort routine. It is reasonable to give up and not +/// return a type if necessary. +static Type *getTypePartition(const DataLayout &TD, Type *Ty, + uint64_t Offset, uint64_t Size) { + if (Offset == 0 && TD.getTypeAllocSize(Ty) == Size) + return stripAggregateTypeWrapping(TD, Ty); + if (Offset > TD.getTypeAllocSize(Ty) || + (TD.getTypeAllocSize(Ty) - Offset) < Size) + return 0; + + if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) { + // We can't partition pointers... 
+ if (SeqTy->isPointerTy()) + return 0; + + Type *ElementTy = SeqTy->getElementType(); + uint64_t ElementSize = TD.getTypeAllocSize(ElementTy); + uint64_t NumSkippedElements = Offset / ElementSize; + if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) + if (NumSkippedElements >= ArrTy->getNumElements()) + return 0; + if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy)) + if (NumSkippedElements >= VecTy->getNumElements()) + return 0; + Offset -= NumSkippedElements * ElementSize; + + // First check if we need to recurse. + if (Offset > 0 || Size < ElementSize) { + // Bail if the partition ends in a different array element. + if ((Offset + Size) > ElementSize) + return 0; + // Recurse through the element type trying to peel off offset bytes. + return getTypePartition(TD, ElementTy, Offset, Size); + } + assert(Offset == 0); + + if (Size == ElementSize) + return stripAggregateTypeWrapping(TD, ElementTy); + assert(Size > ElementSize); + uint64_t NumElements = Size / ElementSize; + if (NumElements * ElementSize != Size) + return 0; + return ArrayType::get(ElementTy, NumElements); + } + + StructType *STy = dyn_cast<StructType>(Ty); + if (!STy) + return 0; + + const StructLayout *SL = TD.getStructLayout(STy); + if (Offset >= SL->getSizeInBytes()) + return 0; + uint64_t EndOffset = Offset + Size; + if (EndOffset > SL->getSizeInBytes()) + return 0; + + unsigned Index = SL->getElementContainingOffset(Offset); + Offset -= SL->getElementOffset(Index); + + Type *ElementTy = STy->getElementType(Index); + uint64_t ElementSize = TD.getTypeAllocSize(ElementTy); + if (Offset >= ElementSize) + return 0; // The offset points into alignment padding. + + // See if any partition must be contained by the element. + if (Offset > 0 || Size < ElementSize) { + if ((Offset + Size) > ElementSize) + return 0; + return getTypePartition(TD, ElementTy, Offset, Size); + } + assert(Offset == 0); + + if (Size == ElementSize) + return stripAggregateTypeWrapping(TD, ElementTy); + + StructType::element_iterator EI = STy->element_begin() + Index, + EE = STy->element_end(); + if (EndOffset < SL->getSizeInBytes()) { + unsigned EndIndex = SL->getElementContainingOffset(EndOffset); + if (Index == EndIndex) + return 0; // Within a single element and its padding. + + // Don't try to form "natural" types if the elements don't line up with the + // expected size. + // FIXME: We could potentially recurse down through the last element in the + // sub-struct to find a natural end point. + if (SL->getElementOffset(EndIndex) != EndOffset) + return 0; + + assert(Index < EndIndex); + EE = STy->element_begin() + EndIndex; + } + + // Try to build up a sub-structure. + StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE), + STy->isPacked()); + const StructLayout *SubSL = TD.getStructLayout(SubTy); + if (Size != SubSL->getSizeInBytes()) + return 0; // The sub-struct doesn't have quite the size needed. + + return SubTy; +} + +/// \brief Rewrite an alloca partition's users. +/// +/// This routine drives both of the rewriting goals of the SROA pass. It tries +/// to rewrite uses of an alloca partition to be conducive for SSA value +/// promotion. If the partition needs a new, more refined alloca, this will +/// build that new alloca, preserving as much type information as possible, and +/// rewrite the uses of the old alloca to point at the new one and have the +/// appropriate new offsets. It also evaluates how successful the rewrite was +/// at enabling promotion and if it was successful queues the alloca to be +/// promoted. 
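To illustrate the struct case of getTypePartition above: the routine succeeds only when the partition starts exactly at a member boundary and ends exactly at another member boundary (or at the end of the struct), in which case it builds a sub-struct from the covered members. The plain C++ toy below hard-codes a hypothetical layout and checks just that boundary condition; the real code additionally re-verifies the candidate sub-struct's own layout, which this sketch omits.

#include <cstdint>
#include <cstdio>
#include <vector>

struct Member { const char *Name; uint64_t Offset; };

static void findPartition(const std::vector<Member> &Members,
                          uint64_t StructSize, uint64_t Offset, uint64_t Size) {
  uint64_t End = Offset + Size;
  int First = -1, Last = -1;
  for (unsigned i = 0; i != Members.size(); ++i) {
    if (Members[i].Offset == Offset) First = (int)i;
    if (Members[i].Offset == End)    Last = (int)i;  // exclusive end boundary
  }
  if (End == StructSize)
    Last = (int)Members.size();                      // runs to the struct end
  if (First < 0 || Last <= First) {
    std::printf("no natural sub-struct for [%llu, %llu)\n",
                (unsigned long long)Offset, (unsigned long long)End);
    return;
  }
  std::printf("sub-struct for [%llu, %llu): { ",
              (unsigned long long)Offset, (unsigned long long)End);
  for (int i = First; i != Last; ++i)
    std::printf("%s ", Members[i].Name);
  std::printf("}\n");
}

int main() {
  // Hypothetical type { i32, i32, i64, i32, i32 } with a typical 64-bit
  // layout: member offsets 0, 4, 8, 16, 20 and an alloc size of 24 bytes.
  std::vector<Member> S = {{"i32", 0}, {"i32", 4}, {"i64", 8},
                           {"i32", 16}, {"i32", 20}};
  findPartition(S, 24, 8, 16);  // covers the last three members exactly
  findPartition(S, 24, 2, 8);   // straddles a member: no natural partition
  return 0;
}

With that layout, the partition [8, 24) yields a natural { i64, i32, i32 } sub-struct, while [2, 10) straddles a member and gets no natural type, so rewriteAllocaPartition below falls back to a legal integer type or an i8 array of the partition size.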
+bool SROA::rewriteAllocaPartition(AllocaInst &AI, + AllocaPartitioning &P, + AllocaPartitioning::iterator PI) { + uint64_t AllocaSize = PI->EndOffset - PI->BeginOffset; + bool IsLive = false; + for (AllocaPartitioning::use_iterator UI = P.use_begin(PI), + UE = P.use_end(PI); + UI != UE && !IsLive; ++UI) + if (UI->U) + IsLive = true; + if (!IsLive) + return false; // No live uses left of this partition. + + DEBUG(dbgs() << "Speculating PHIs and selects in partition " + << "[" << PI->BeginOffset << "," << PI->EndOffset << ")\n"); + + PHIOrSelectSpeculator Speculator(*TD, P, *this); + DEBUG(dbgs() << " speculating "); + DEBUG(P.print(dbgs(), PI, "")); + Speculator.visitUsers(PI); + + // Try to compute a friendly type for this partition of the alloca. This + // won't always succeed, in which case we fall back to a legal integer type + // or an i8 array of an appropriate size. + Type *AllocaTy = 0; + if (Type *PartitionTy = P.getCommonType(PI)) + if (TD->getTypeAllocSize(PartitionTy) >= AllocaSize) + AllocaTy = PartitionTy; + if (!AllocaTy) + if (Type *PartitionTy = getTypePartition(*TD, AI.getAllocatedType(), + PI->BeginOffset, AllocaSize)) + AllocaTy = PartitionTy; + if ((!AllocaTy || + (AllocaTy->isArrayTy() && + AllocaTy->getArrayElementType()->isIntegerTy())) && + TD->isLegalInteger(AllocaSize * 8)) + AllocaTy = Type::getIntNTy(*C, AllocaSize * 8); + if (!AllocaTy) + AllocaTy = ArrayType::get(Type::getInt8Ty(*C), AllocaSize); + assert(TD->getTypeAllocSize(AllocaTy) >= AllocaSize); + + // Check for the case where we're going to rewrite to a new alloca of the + // exact same type as the original, and with the same access offsets. In that + // case, re-use the existing alloca, but still run through the rewriter to + // performe phi and select speculation. + AllocaInst *NewAI; + if (AllocaTy == AI.getAllocatedType()) { + assert(PI->BeginOffset == 0 && + "Non-zero begin offset but same alloca type"); + assert(PI == P.begin() && "Begin offset is zero on later partition"); + NewAI = &AI; + } else { + unsigned Alignment = AI.getAlignment(); + if (!Alignment) { + // The minimum alignment which users can rely on when the explicit + // alignment is omitted or zero is that required by the ABI for this + // type. + Alignment = TD->getABITypeAlignment(AI.getAllocatedType()); + } + Alignment = MinAlign(Alignment, PI->BeginOffset); + // If we will get at least this much alignment from the type alone, leave + // the alloca's alignment unconstrained. + if (Alignment <= TD->getABITypeAlignment(AllocaTy)) + Alignment = 0; + NewAI = new AllocaInst(AllocaTy, 0, Alignment, + AI.getName() + ".sroa." + Twine(PI - P.begin()), + &AI); + ++NumNewAllocas; + } + + DEBUG(dbgs() << "Rewriting alloca partition " + << "[" << PI->BeginOffset << "," << PI->EndOffset << ") to: " + << *NewAI << "\n"); + + // Track the high watermark of the post-promotion worklist. We will reset it + // to this point if the alloca is not in fact scheduled for promotion. + unsigned PPWOldSize = PostPromotionWorklist.size(); + + AllocaPartitionRewriter Rewriter(*TD, P, PI, *this, AI, *NewAI, + PI->BeginOffset, PI->EndOffset); + DEBUG(dbgs() << " rewriting "); + DEBUG(P.print(dbgs(), PI, "")); + bool Promotable = Rewriter.visitUsers(P.use_begin(PI), P.use_end(PI)); + if (Promotable) { + DEBUG(dbgs() << " and queuing for promotion\n"); + PromotableAllocas.push_back(NewAI); + } else if (NewAI != &AI) { + // If we can't promote the alloca, iterate on it to check for new + // refinements exposed by splitting the current alloca. 
Don't iterate on an + // alloca which didn't actually change and didn't get promoted. + Worklist.insert(NewAI); + } + + // Drop any post-promotion work items if promotion didn't happen. + if (!Promotable) + while (PostPromotionWorklist.size() > PPWOldSize) + PostPromotionWorklist.pop_back(); + + return true; +} + +/// \brief Walks the partitioning of an alloca rewriting uses of each partition. +bool SROA::splitAlloca(AllocaInst &AI, AllocaPartitioning &P) { + bool Changed = false; + for (AllocaPartitioning::iterator PI = P.begin(), PE = P.end(); PI != PE; + ++PI) + Changed |= rewriteAllocaPartition(AI, P, PI); + + return Changed; +} + +/// \brief Analyze an alloca for SROA. +/// +/// This analyzes the alloca to ensure we can reason about it, builds +/// a partitioning of the alloca, and then hands it off to be split and +/// rewritten as needed. +bool SROA::runOnAlloca(AllocaInst &AI) { + DEBUG(dbgs() << "SROA alloca: " << AI << "\n"); + ++NumAllocasAnalyzed; + + // Special case dead allocas, as they're trivial. + if (AI.use_empty()) { + AI.eraseFromParent(); + return true; + } + + // Skip alloca forms that this analysis can't handle. + if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() || + TD->getTypeAllocSize(AI.getAllocatedType()) == 0) + return false; + + bool Changed = false; + + // First, split any FCA loads and stores touching this alloca to promote + // better splitting and promotion opportunities. + AggLoadStoreRewriter AggRewriter(*TD); + Changed |= AggRewriter.rewrite(AI); + + // Build the partition set using a recursive instruction-visiting builder. + AllocaPartitioning P(*TD, AI); + DEBUG(P.print(dbgs())); + if (P.isEscaped()) + return Changed; + + // Delete all the dead users of this alloca before splitting and rewriting it. + for (AllocaPartitioning::dead_user_iterator DI = P.dead_user_begin(), + DE = P.dead_user_end(); + DI != DE; ++DI) { + Changed = true; + (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType())); + DeadInsts.insert(*DI); + } + for (AllocaPartitioning::dead_op_iterator DO = P.dead_op_begin(), + DE = P.dead_op_end(); + DO != DE; ++DO) { + Value *OldV = **DO; + // Clobber the use with an undef value. + **DO = UndefValue::get(OldV->getType()); + if (Instruction *OldI = dyn_cast<Instruction>(OldV)) + if (isInstructionTriviallyDead(OldI)) { + Changed = true; + DeadInsts.insert(OldI); + } + } + + // No partitions to split. Leave the dead alloca for a later pass to clean up. + if (P.begin() == P.end()) + return Changed; + + return splitAlloca(AI, P) || Changed; +} + +/// \brief Delete the dead instructions accumulated in this run. +/// +/// Recursively deletes the dead instructions we've accumulated. This is done +/// at the very end to maximize locality of the recursive delete and to +/// minimize the problems of invalidated instruction pointers as such pointers +/// are used heavily in the intermediate stages of the algorithm. +/// +/// We also record the alloca instructions deleted here so that they aren't +/// subsequently handed to mem2reg to promote. +void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) { + while (!DeadInsts.empty()) { + Instruction *I = DeadInsts.pop_back_val(); + DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n"); + + I->replaceAllUsesWith(UndefValue::get(I->getType())); + + for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) + if (Instruction *U = dyn_cast<Instruction>(*OI)) { + // Zero out the operand and see if it becomes trivially dead. 
+ *OI = 0; + if (isInstructionTriviallyDead(U)) + DeadInsts.insert(U); + } + + if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) + DeletedAllocas.insert(AI); + + ++NumDeleted; + I->eraseFromParent(); + } +} + +/// \brief Promote the allocas, using the best available technique. +/// +/// This attempts to promote whatever allocas have been identified as viable in +/// the PromotableAllocas list. If that list is empty, there is nothing to do. +/// If there is a domtree available, we attempt to promote using the full power +/// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is +/// based on the SSAUpdater utilities. This function returns whether any +/// promotion occured. +bool SROA::promoteAllocas(Function &F) { + if (PromotableAllocas.empty()) + return false; + + NumPromoted += PromotableAllocas.size(); + + if (DT && !ForceSSAUpdater) { + DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); + PromoteMemToReg(PromotableAllocas, *DT); + PromotableAllocas.clear(); + return true; + } + + DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n"); + SSAUpdater SSA; + DIBuilder DIB(*F.getParent()); + SmallVector<Instruction*, 64> Insts; + + for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) { + AllocaInst *AI = PromotableAllocas[Idx]; + for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end(); + UI != UE;) { + Instruction *I = cast<Instruction>(*UI++); + // FIXME: Currently the SSAUpdater infrastructure doesn't reason about + // lifetime intrinsics and so we strip them (and the bitcasts+GEPs + // leading to them) here. Eventually it should use them to optimize the + // scalar values produced. + if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) { + assert(onlyUsedByLifetimeMarkers(I) && + "Found a bitcast used outside of a lifetime marker."); + while (!I->use_empty()) + cast<Instruction>(*I->use_begin())->eraseFromParent(); + I->eraseFromParent(); + continue; + } + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + assert(II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end); + II->eraseFromParent(); + continue; + } + + Insts.push_back(I); + } + AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts); + Insts.clear(); + } + + PromotableAllocas.clear(); + return true; +} + +namespace { + /// \brief A predicate to test whether an alloca belongs to a set. + class IsAllocaInSet { + typedef SmallPtrSet<AllocaInst *, 4> SetType; + const SetType &Set; + + public: + typedef AllocaInst *argument_type; + + IsAllocaInSet(const SetType &Set) : Set(Set) {} + bool operator()(AllocaInst *AI) const { return Set.count(AI); } + }; +} + +bool SROA::runOnFunction(Function &F) { + DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); + C = &F.getContext(); + TD = getAnalysisIfAvailable<DataLayout>(); + if (!TD) { + DEBUG(dbgs() << " Skipping SROA -- no target data!\n"); + return false; + } + DT = getAnalysisIfAvailable<DominatorTree>(); + + BasicBlock &EntryBB = F.getEntryBlock(); + for (BasicBlock::iterator I = EntryBB.begin(), E = llvm::prior(EntryBB.end()); + I != E; ++I) + if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) + Worklist.insert(AI); + + bool Changed = false; + // A set of deleted alloca instruction pointers which should be removed from + // the list of promotable allocas. 
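The IsAllocaInSet predicate above feeds both the remove_if calls on the work lists and the standard erase(remove_if(...)) idiom used on PromotableAllocas in the driver loop below. A generic plain-C++ sketch of that idiom, with ints standing in for the alloca pointers:

#include <algorithm>
#include <cstdio>
#include <set>
#include <vector>

// Stand-in for IsAllocaInSet: true for elements that belong to the set of
// deleted items and therefore must be dropped from the work lists.
struct IsInSet {
  const std::set<int> &Deleted;
  explicit IsInSet(const std::set<int> &D) : Deleted(D) {}
  bool operator()(int V) const { return Deleted.count(V) != 0; }
};

int main() {
  std::vector<int> Promotable = {1, 2, 3, 4, 5};
  std::set<int> Deleted = {2, 4};

  // erase(remove_if(...)): remove_if compacts the survivors to the front and
  // returns the new logical end; erase then drops the leftover tail.
  Promotable.erase(std::remove_if(Promotable.begin(), Promotable.end(),
                                  IsInSet(Deleted)),
                   Promotable.end());

  for (int V : Promotable)
    std::printf("%d ", V);   // prints: 1 3 5
  std::printf("\n");
  return 0;
}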
+ SmallPtrSet<AllocaInst *, 4> DeletedAllocas; + + do { + while (!Worklist.empty()) { + Changed |= runOnAlloca(*Worklist.pop_back_val()); + deleteDeadInstructions(DeletedAllocas); + + // Remove the deleted allocas from various lists so that we don't try to + // continue processing them. + if (!DeletedAllocas.empty()) { + Worklist.remove_if(IsAllocaInSet(DeletedAllocas)); + PostPromotionWorklist.remove_if(IsAllocaInSet(DeletedAllocas)); + PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(), + PromotableAllocas.end(), + IsAllocaInSet(DeletedAllocas)), + PromotableAllocas.end()); + DeletedAllocas.clear(); + } + } + + Changed |= promoteAllocas(F); + + Worklist = PostPromotionWorklist; + PostPromotionWorklist.clear(); + } while (!Worklist.empty()); + + return Changed; +} + +void SROA::getAnalysisUsage(AnalysisUsage &AU) const { + if (RequiresDomTree) + AU.addRequired<DominatorTree>(); + AU.setPreservesCFG(); +} diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 48318c8..35d2fa0 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -13,14 +13,14 @@ // //===----------------------------------------------------------------------===// -#include "llvm-c/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar.h" #include "llvm-c/Initialization.h" -#include "llvm/InitializePasses.h" -#include "llvm/PassManager.h" +#include "llvm-c/Transforms/Scalar.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/Verifier.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Transforms/Scalar.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/InitializePasses.h" +#include "llvm/PassManager.h" using namespace llvm; @@ -59,6 +59,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeRegToMemPass(Registry); initializeSCCPPass(Registry); initializeIPSCCPPass(Registry); + initializeSROAPass(Registry); initializeSROA_DTPass(Registry); initializeSROA_SSAUpPass(Registry); initializeCFGSimplifyPassPass(Registry); diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index 8090fdf..e590a37 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -21,32 +21,32 @@ #define DEBUG_TYPE "scalarrepl" #include "llvm/Transforms/Scalar.h" -#include "llvm/Constants.h" -#include "llvm/DIBuilder.h" -#include "llvm/DebugInfo.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Function.h" -#include "llvm/GlobalVariable.h" -#include "llvm/IRBuilder.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/LLVMContext.h" -#include "llvm/Module.h" -#include "llvm/Operator.h" -#include "llvm/Pass.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/DIBuilder.h" +#include "llvm/DebugInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/Pass.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include 
"llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetData.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -87,7 +87,7 @@ namespace { private: bool HasDomTree; - TargetData *TD; + DataLayout *TD; /// DeadInsts - Keep track of instructions we have made dead, so that /// we can remove them after we are done working. @@ -258,7 +258,7 @@ namespace { class ConvertToScalarInfo { /// AllocaSize - The size of the alloca being considered in bytes. unsigned AllocaSize; - const TargetData &TD; + const DataLayout &TD; unsigned ScalarLoadThreshold; /// IsNotTrivial - This is set to true if there is some access to the object @@ -301,7 +301,7 @@ class ConvertToScalarInfo { bool HadDynamicAccess; public: - explicit ConvertToScalarInfo(unsigned Size, const TargetData &td, + explicit ConvertToScalarInfo(unsigned Size, const DataLayout &td, unsigned SLT) : AllocaSize(Size), TD(td), ScalarLoadThreshold(SLT), IsNotTrivial(false), ScalarKind(Unknown), VectorTy(0), HadNonMemTransferAccess(false), @@ -1020,11 +1020,11 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, bool SROA::runOnFunction(Function &F) { - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); bool Changed = performPromotion(F); - // FIXME: ScalarRepl currently depends on TargetData more than it + // FIXME: ScalarRepl currently depends on DataLayout more than it // theoretically needs to. It should be refactored in order to support // target-independent IR. Until this is done, just skip the actual // scalar-replacement portion of this pass. @@ -1134,7 +1134,7 @@ public: /// /// We can do this to a select if its only uses are loads and if the operand to /// the select can be loaded unconditionally. -static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) { +static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *TD) { bool TDerefable = SI->getTrueValue()->isDereferenceablePointer(); bool FDerefable = SI->getFalseValue()->isDereferenceablePointer(); @@ -1172,7 +1172,7 @@ static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) { /// /// We can do this to a select if its only uses are loads and if the operand to /// the select can be loaded unconditionally. -static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) { +static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *TD) { // For now, we can only do this promotion if the load is in the same block as // the PHI, and if there are no stores between the phi and load. // TODO: Allow recursive phi users. @@ -1236,7 +1236,7 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) { /// direct (non-volatile) loads and stores to it. If the alloca is close but /// not quite there, this will transform the code to allow promotion. As such, /// it is a non-pure predicate. -static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { +static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) { SetVector<Instruction*, SmallVector<Instruction*, 4>, SmallPtrSet<Instruction*, 4> > InstsToRewrite; @@ -2537,7 +2537,7 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, /// HasPadding - Return true if the specified type has any structure or /// alignment padding in between the elements that would be split apart /// by SROA; return false otherwise. 
-static bool HasPadding(Type *Ty, const TargetData &TD) { +static bool HasPadding(Type *Ty, const DataLayout &TD) { if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { Ty = ATy->getElementType(); return TD.getTypeSizeInBits(Ty) != TD.getTypeAllocSizeInBits(Ty); diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 6d27db1..c243d34 100644 --- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -23,18 +23,19 @@ #define DEBUG_TYPE "simplifycfg" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Constants.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Module.h" -#include "llvm/Attributes.h" -#include "llvm/Support/CFG.h" -#include "llvm/Pass.h" -#include "llvm/Target/TargetData.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CFG.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; STATISTIC(NumSimpl, "Number of blocks simplified"); @@ -47,12 +48,19 @@ namespace { } virtual bool runOnFunction(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetTransformInfo>(); + } }; } char CFGSimplifyPass::ID = 0; -INITIALIZE_PASS(CFGSimplifyPass, "simplifycfg", - "Simplify the CFG", false, false) +INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", + false, false) +INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", + false, false) // Public interface to the CFGSimplification pass FunctionPass *llvm::createCFGSimplificationPass() { @@ -110,13 +118,11 @@ static bool markAliveBlocks(BasicBlock *BB, SmallVector<BasicBlock*, 128> Worklist; Worklist.push_back(BB); + Reachable.insert(BB); bool Changed = false; do { BB = Worklist.pop_back_val(); - if (!Reachable.insert(BB)) - continue; - // Do a quick scan of the basic block, turning any obviously unreachable // instructions into LLVM unreachable insts. The instruction combining pass // canonicalizes unreachable insts into stores to null or undef. @@ -175,7 +181,8 @@ static bool markAliveBlocks(BasicBlock *BB, Changed |= ConstantFoldTerminator(BB, true); for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) - Worklist.push_back(*SI); + if (Reachable.insert(*SI)) + Worklist.push_back(*SI); } while (!Worklist.empty()); return Changed; } @@ -293,7 +300,8 @@ static bool mergeEmptyReturnBlocks(Function &F) { /// iterativelySimplifyCFG - Call SimplifyCFG on all the blocks in the function, /// iterating until no more changes are made. -static bool iterativelySimplifyCFG(Function &F, const TargetData *TD) { +static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, + const DataLayout *TD) { bool Changed = false; bool LocalChange = true; while (LocalChange) { @@ -302,7 +310,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetData *TD) { // Loop over all of the basic blocks and remove them if they are unneeded... 
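The markAliveBlocks hunks above change the reachability walk from "check on pop" to "mark before push", so a block with several predecessors is enqueued at most once. The standalone sketch below contrasts the two disciplines on a tiny hypothetical four-node graph, counting worklist pushes (plain C++, no LLVM).

#include <cstdio>
#include <set>
#include <vector>

// A tiny CFG as an adjacency list: node 0 branches to 1 and 2, both of which
// branch to the join node 3.
static const std::vector<std::vector<int>> Succs = {{1, 2}, {3}, {3}, {}};

// Count how many worklist pushes each discipline performs.
static unsigned walk(bool MarkBeforePush) {
  std::set<int> Reachable;
  std::vector<int> Worklist = {0};
  unsigned Pushes = 1;
  if (MarkBeforePush)
    Reachable.insert(0);
  while (!Worklist.empty()) {
    int BB = Worklist.back();
    Worklist.pop_back();
    if (!MarkBeforePush && !Reachable.insert(BB).second)
      continue;                           // old scheme: skip duplicates on pop
    for (int S : Succs[BB]) {
      if (MarkBeforePush) {
        if (Reachable.insert(S).second) { // new scheme: dedupe before pushing
          Worklist.push_back(S);
          ++Pushes;
        }
      } else {
        Worklist.push_back(S);
        ++Pushes;
      }
    }
  }
  return Pushes;
}

int main() {
  std::printf("check-on-pop pushes:     %u\n", walk(false));
  std::printf("mark-before-push pushes: %u\n", walk(true));
  return 0;
}

On this diamond-shaped graph the old scheme pushes the join block once per predecessor and discards the duplicates on pop, while the new scheme never enqueues it twice.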
// for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) { - if (SimplifyCFG(BBIt++, TD)) { + if (SimplifyCFG(BBIt++, TTI, TD)) { LocalChange = true; ++NumSimpl; } @@ -316,10 +324,11 @@ static bool iterativelySimplifyCFG(Function &F, const TargetData *TD) { // simplify the CFG. // bool CFGSimplifyPass::runOnFunction(Function &F) { - const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>(); + const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); bool EverChanged = removeUnreachableBlocksFromFn(F); EverChanged |= mergeEmptyReturnBlocks(F); - EverChanged |= iterativelySimplifyCFG(F, TD); + EverChanged |= iterativelySimplifyCFG(F, TTI, TD); // If neither pass changed anything, we're done. if (!EverChanged) return false; @@ -333,7 +342,7 @@ bool CFGSimplifyPass::runOnFunction(Function &F) { return true; do { - EverChanged = iterativelySimplifyCFG(F, TD); + EverChanged = iterativelySimplifyCFG(F, TTI, TD); EverChanged |= removeUnreachableBlocksFromFn(F); } while (EverChanged); diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index 65311fe..d5cefa3 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -17,32 +17,26 @@ #define DEBUG_TYPE "simplify-libcalls" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BuildLibCalls.h" -#include "llvm/IRBuilder.h" -#include "llvm/Intrinsics.h" -#include "llvm/LLVMContext.h" -#include "llvm/Module.h" -#include "llvm/Pass.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringMap.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Config/config.h" // FIXME: Shouldn't depend on host! +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Config/config.h" // FIXME: Shouldn't depend on host! 
+#include "llvm/Transforms/Utils/BuildLibCalls.h" using namespace llvm; -STATISTIC(NumSimplified, "Number of library calls simplified"); STATISTIC(NumAnnotated, "Number of attributes added to library functions"); -static cl::opt<bool> UnsafeFPShrink("enable-double-float-shrink", cl::Hidden, - cl::init(false), - cl::desc("Enable unsafe double to float " - "shrinking for math lib calls")); //===----------------------------------------------------------------------===// // Optimizer Base Class //===----------------------------------------------------------------------===// @@ -53,7 +47,7 @@ namespace { class LibCallOptimization { protected: Function *Caller; - const TargetData *TD; + const DataLayout *TD; const TargetLibraryInfo *TLI; LLVMContext* Context; public: @@ -68,7 +62,7 @@ public: virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) =0; - Value *OptimizeCall(CallInst *CI, const TargetData *TD, + Value *OptimizeCall(CallInst *CI, const DataLayout *TD, const TargetLibraryInfo *TLI, IRBuilder<> &B) { Caller = CI->getParent()->getParent(); this->TD = TD; @@ -87,1470 +81,6 @@ public: //===----------------------------------------------------------------------===// -// Helper Functions -//===----------------------------------------------------------------------===// - -/// IsOnlyUsedInZeroEqualityComparison - Return true if it only matters that the -/// value is equal or not-equal to zero. -static bool IsOnlyUsedInZeroEqualityComparison(Value *V) { - for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); - UI != E; ++UI) { - if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI)) - if (IC->isEquality()) - if (Constant *C = dyn_cast<Constant>(IC->getOperand(1))) - if (C->isNullValue()) - continue; - // Unknown instruction. - return false; - } - return true; -} - -static bool CallHasFloatingPointArgument(const CallInst *CI) { - for (CallInst::const_op_iterator it = CI->op_begin(), e = CI->op_end(); - it != e; ++it) { - if ((*it)->getType()->isFloatingPointTy()) - return true; - } - return false; -} - -/// IsOnlyUsedInEqualityComparison - Return true if it is only used in equality -/// comparisons with With. -static bool IsOnlyUsedInEqualityComparison(Value *V, Value *With) { - for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); - UI != E; ++UI) { - if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI)) - if (IC->isEquality() && IC->getOperand(1) == With) - continue; - // Unknown instruction. - return false; - } - return true; -} - -//===----------------------------------------------------------------------===// -// String and Memory LibCall Optimizations -//===----------------------------------------------------------------------===// - -//===---------------------------------------===// -// 'strcat' Optimizations -namespace { -struct StrCatOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "strcat" function prototype. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || - FT->getReturnType() != B.getInt8PtrTy() || - FT->getParamType(0) != FT->getReturnType() || - FT->getParamType(1) != FT->getReturnType()) - return 0; - - // Extract some information from the instruction - Value *Dst = CI->getArgOperand(0); - Value *Src = CI->getArgOperand(1); - - // See if we can get the length of the input string. - uint64_t Len = GetStringLength(Src); - if (Len == 0) return 0; - --Len; // Unbias length. 
- - // Handle the simple, do-nothing case: strcat(x, "") -> x - if (Len == 0) - return Dst; - - // These optimizations require TargetData. - if (!TD) return 0; - - return EmitStrLenMemCpy(Src, Dst, Len, B); - } - - Value *EmitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, IRBuilder<> &B) { - // We need to find the end of the destination string. That's where the - // memory is to be moved to. We just generate a call to strlen. - Value *DstLen = EmitStrLen(Dst, B, TD, TLI); - if (!DstLen) - return 0; - - // Now that we have the destination's length, we must index into the - // destination's pointer to get the actual memcpy destination (end of - // the string .. we're concatenating). - Value *CpyDst = B.CreateGEP(Dst, DstLen, "endptr"); - - // We have enough information to now generate the memcpy call to do the - // concatenation for us. Make a memcpy to copy the nul byte with align = 1. - B.CreateMemCpy(CpyDst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len + 1), 1); - return Dst; - } -}; - -//===---------------------------------------===// -// 'strncat' Optimizations - -struct StrNCatOpt : public StrCatOpt { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "strncat" function prototype. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || - FT->getReturnType() != B.getInt8PtrTy() || - FT->getParamType(0) != FT->getReturnType() || - FT->getParamType(1) != FT->getReturnType() || - !FT->getParamType(2)->isIntegerTy()) - return 0; - - // Extract some information from the instruction - Value *Dst = CI->getArgOperand(0); - Value *Src = CI->getArgOperand(1); - uint64_t Len; - - // We don't do anything if length is not constant - if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2))) - Len = LengthArg->getZExtValue(); - else - return 0; - - // See if we can get the length of the input string. - uint64_t SrcLen = GetStringLength(Src); - if (SrcLen == 0) return 0; - --SrcLen; // Unbias length. - - // Handle the simple, do-nothing cases: - // strncat(x, "", c) -> x - // strncat(x, c, 0) -> x - if (SrcLen == 0 || Len == 0) return Dst; - - // These optimizations require TargetData. - if (!TD) return 0; - - // We don't optimize this case - if (Len < SrcLen) return 0; - - // strncat(x, s, c) -> strcat(x, s) - // s is constant so the strcat can be optimized further - return EmitStrLenMemCpy(Src, Dst, SrcLen, B); - } -}; - -//===---------------------------------------===// -// 'strchr' Optimizations - -struct StrChrOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "strchr" function prototype. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || - FT->getReturnType() != B.getInt8PtrTy() || - FT->getParamType(0) != FT->getReturnType() || - !FT->getParamType(1)->isIntegerTy(32)) - return 0; - - Value *SrcStr = CI->getArgOperand(0); - - // If the second operand is non-constant, see if we can compute the length - // of the input string and turn this into memchr. - ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); - if (CharC == 0) { - // These optimizations require TargetData. - if (!TD) return 0; - - uint64_t Len = GetStringLength(SrcStr); - if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32))// memchr needs i32. - return 0; - - return EmitMemChr(SrcStr, CI->getArgOperand(1), // include nul. 
- ConstantInt::get(TD->getIntPtrType(*Context), Len), - B, TD, TLI); - } - - // Otherwise, the character is a constant, see if the first argument is - // a string literal. If so, we can constant fold. - StringRef Str; - if (!getConstantStringInfo(SrcStr, Str)) - return 0; - - // Compute the offset, make sure to handle the case when we're searching for - // zero (a weird way to spell strlen). - size_t I = CharC->getSExtValue() == 0 ? - Str.size() : Str.find(CharC->getSExtValue()); - if (I == StringRef::npos) // Didn't find the char. strchr returns null. - return Constant::getNullValue(CI->getType()); - - // strchr(s+n,c) -> gep(s+n+i,c) - return B.CreateGEP(SrcStr, B.getInt64(I), "strchr"); - } -}; - -//===---------------------------------------===// -// 'strrchr' Optimizations - -struct StrRChrOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "strrchr" function prototype. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || - FT->getReturnType() != B.getInt8PtrTy() || - FT->getParamType(0) != FT->getReturnType() || - !FT->getParamType(1)->isIntegerTy(32)) - return 0; - - Value *SrcStr = CI->getArgOperand(0); - ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); - - // Cannot fold anything if we're not looking for a constant. - if (!CharC) - return 0; - - StringRef Str; - if (!getConstantStringInfo(SrcStr, Str)) { - // strrchr(s, 0) -> strchr(s, 0) - if (TD && CharC->isZero()) - return EmitStrChr(SrcStr, '\0', B, TD, TLI); - return 0; - } - - // Compute the offset. - size_t I = CharC->getSExtValue() == 0 ? - Str.size() : Str.rfind(CharC->getSExtValue()); - if (I == StringRef::npos) // Didn't find the char. Return null. - return Constant::getNullValue(CI->getType()); - - // strrchr(s+n,c) -> gep(s+n+i,c) - return B.CreateGEP(SrcStr, B.getInt64(I), "strrchr"); - } -}; - -//===---------------------------------------===// -// 'strcmp' Optimizations - -struct StrCmpOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "strcmp" function prototype. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || - !FT->getReturnType()->isIntegerTy(32) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != B.getInt8PtrTy()) - return 0; - - Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); - if (Str1P == Str2P) // strcmp(x,x) -> 0 - return ConstantInt::get(CI->getType(), 0); - - StringRef Str1, Str2; - bool HasStr1 = getConstantStringInfo(Str1P, Str1); - bool HasStr2 = getConstantStringInfo(Str2P, Str2); - - // strcmp(x, y) -> cnst (if both x and y are constant strings) - if (HasStr1 && HasStr2) - return ConstantInt::get(CI->getType(), Str1.compare(Str2)); - - if (HasStr1 && Str1.empty()) // strcmp("", x) -> -*x - return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), - CI->getType())); - - if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x - return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); - - // strcmp(P, "x") -> memcmp(P, "x", 2) - uint64_t Len1 = GetStringLength(Str1P); - uint64_t Len2 = GetStringLength(Str2P); - if (Len1 && Len2) { - // These optimizations require TargetData. 
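
The last StrCmpOpt case turns strcmp into a memcmp over min(Len1, Len2) bytes, where the lengths from GetStringLength include the terminating nul. A plain C++ sign check of that rewrite (illustration only):

#include <string.h>
#include <stdio.h>

static int sign(int V) { return (V > 0) - (V < 0); }

static void check(const char *A, const char *B) {
  size_t La = strlen(A), Lb = strlen(B);
  size_t N = (La < Lb ? La : Lb) + 1;       // min length plus the nul byte
  printf("strcmp(\"%s\",\"%s\") sign %d, memcmp(..,%zu) sign %d\n",
         A, B, sign(strcmp(A, B)), N, sign(memcmp(A, B, N)));
}

int main() {
  check("abc", "abd");
  check("abc", "ab");
  check("same", "same");
  return 0;
}
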
- if (!TD) return 0; - - return EmitMemCmp(Str1P, Str2P, - ConstantInt::get(TD->getIntPtrType(*Context), - std::min(Len1, Len2)), B, TD, TLI); - } - - return 0; - } -}; - -//===---------------------------------------===// -// 'strncmp' Optimizations - -struct StrNCmpOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "strncmp" function prototype. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || - !FT->getReturnType()->isIntegerTy(32) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != B.getInt8PtrTy() || - !FT->getParamType(2)->isIntegerTy()) - return 0; - - Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); - if (Str1P == Str2P) // strncmp(x,x,n) -> 0 - return ConstantInt::get(CI->getType(), 0); - - // Get the length argument if it is constant. - uint64_t Length; - if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2))) - Length = LengthArg->getZExtValue(); - else - return 0; - - if (Length == 0) // strncmp(x,y,0) -> 0 - return ConstantInt::get(CI->getType(), 0); - - if (TD && Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1) - return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, TD, TLI); - - StringRef Str1, Str2; - bool HasStr1 = getConstantStringInfo(Str1P, Str1); - bool HasStr2 = getConstantStringInfo(Str2P, Str2); - - // strncmp(x, y) -> cnst (if both x and y are constant strings) - if (HasStr1 && HasStr2) { - StringRef SubStr1 = Str1.substr(0, Length); - StringRef SubStr2 = Str2.substr(0, Length); - return ConstantInt::get(CI->getType(), SubStr1.compare(SubStr2)); - } - - if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> -*x - return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), - CI->getType())); - - if (HasStr2 && Str2.empty()) // strncmp(x, "", n) -> *x - return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); - - return 0; - } -}; - - -//===---------------------------------------===// -// 'strcpy' Optimizations - -struct StrCpyOpt : public LibCallOptimization { - bool OptChkCall; // True if it's optimizing a __strcpy_chk libcall. - - StrCpyOpt(bool c) : OptChkCall(c) {} - - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "strcpy" function prototype. - unsigned NumParams = OptChkCall ? 3 : 2; - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != NumParams || - FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != B.getInt8PtrTy()) - return 0; - - Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); - if (Dst == Src) // strcpy(x,x) -> x - return Src; - - // These optimizations require TargetData. - if (!TD) return 0; - - // See if we can get the length of the input string. - uint64_t Len = GetStringLength(Src); - if (Len == 0) return 0; - - // We have enough information to now generate the memcpy call to do the - // concatenation for us. Make a memcpy to copy the nul byte with align = 1. - if (!OptChkCall || - !EmitMemCpyChk(Dst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len), - CI->getArgOperand(2), B, TD, TLI)) - B.CreateMemCpy(Dst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len), 1); - return Dst; - } -}; - -//===---------------------------------------===// -// 'stpcpy' Optimizations - -struct StpCpyOpt: public LibCallOptimization { - bool OptChkCall; // True if it's optimizing a __stpcpy_chk libcall. 
- - StpCpyOpt(bool c) : OptChkCall(c) {} - - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "stpcpy" function prototype. - unsigned NumParams = OptChkCall ? 3 : 2; - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != NumParams || - FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != B.getInt8PtrTy()) - return 0; - - // These optimizations require TargetData. - if (!TD) return 0; - - Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); - if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x) - Value *StrLen = EmitStrLen(Src, B, TD, TLI); - return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : 0; - } - - // See if we can get the length of the input string. - uint64_t Len = GetStringLength(Src); - if (Len == 0) return 0; - - Value *LenV = ConstantInt::get(TD->getIntPtrType(*Context), Len); - Value *DstEnd = B.CreateGEP(Dst, - ConstantInt::get(TD->getIntPtrType(*Context), - Len - 1)); - - // We have enough information to now generate the memcpy call to do the - // copy for us. Make a memcpy to copy the nul byte with align = 1. - if (!OptChkCall || !EmitMemCpyChk(Dst, Src, LenV, CI->getArgOperand(2), B, - TD, TLI)) - B.CreateMemCpy(Dst, Src, LenV, 1); - return DstEnd; - } -}; - -//===---------------------------------------===// -// 'strncpy' Optimizations - -struct StrNCpyOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != B.getInt8PtrTy() || - !FT->getParamType(2)->isIntegerTy()) - return 0; - - Value *Dst = CI->getArgOperand(0); - Value *Src = CI->getArgOperand(1); - Value *LenOp = CI->getArgOperand(2); - - // See if we can get the length of the input string. - uint64_t SrcLen = GetStringLength(Src); - if (SrcLen == 0) return 0; - --SrcLen; - - if (SrcLen == 0) { - // strncpy(x, "", y) -> memset(x, '\0', y, 1) - B.CreateMemSet(Dst, B.getInt8('\0'), LenOp, 1); - return Dst; - } - - uint64_t Len; - if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(LenOp)) - Len = LengthArg->getZExtValue(); - else - return 0; - - if (Len == 0) return Dst; // strncpy(x, y, 0) -> x - - // These optimizations require TargetData. 
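
StrNCpyOpt's empty-source case above relies on strncpy(x, "", n) doing nothing but zero-fill, which is exactly a memset. A plain C++ check (illustration only):

#include <string.h>
#include <stdio.h>

int main() {
  char A[8], B[8];
  memset(A, 'x', sizeof A);
  memset(B, 'x', sizeof B);

  strncpy(A, "", 5);              // original libcall: copies nothing, pads with '\0'
  memset(B, '\0', 5);             // emitted replacement

  printf("equal=%d\n", memcmp(A, B, sizeof A) == 0);
  return 0;
}
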
- if (!TD) return 0; - - // Let strncpy handle the zero padding - if (Len > SrcLen+1) return 0; - - // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant] - B.CreateMemCpy(Dst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len), 1); - - return Dst; - } -}; - -//===---------------------------------------===// -// 'strlen' Optimizations - -struct StrLenOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 1 || - FT->getParamType(0) != B.getInt8PtrTy() || - !FT->getReturnType()->isIntegerTy()) - return 0; - - Value *Src = CI->getArgOperand(0); - - // Constant folding: strlen("xyz") -> 3 - if (uint64_t Len = GetStringLength(Src)) - return ConstantInt::get(CI->getType(), Len-1); - - // strlen(x) != 0 --> *x != 0 - // strlen(x) == 0 --> *x == 0 - if (IsOnlyUsedInZeroEqualityComparison(CI)) - return B.CreateZExt(B.CreateLoad(Src, "strlenfirst"), CI->getType()); - return 0; - } -}; - - -//===---------------------------------------===// -// 'strpbrk' Optimizations - -struct StrPBrkOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || - FT->getParamType(0) != B.getInt8PtrTy() || - FT->getParamType(1) != FT->getParamType(0) || - FT->getReturnType() != FT->getParamType(0)) - return 0; - - StringRef S1, S2; - bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); - bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); - - // strpbrk(s, "") -> NULL - // strpbrk("", s) -> NULL - if ((HasS1 && S1.empty()) || (HasS2 && S2.empty())) - return Constant::getNullValue(CI->getType()); - - // Constant folding. - if (HasS1 && HasS2) { - size_t I = S1.find_first_of(S2); - if (I == std::string::npos) // No match. - return Constant::getNullValue(CI->getType()); - - return B.CreateGEP(CI->getArgOperand(0), B.getInt64(I), "strpbrk"); - } - - // strpbrk(s, "a") -> strchr(s, 'a') - if (TD && HasS2 && S2.size() == 1) - return EmitStrChr(CI->getArgOperand(0), S2[0], B, TD, TLI); - - return 0; - } -}; - -//===---------------------------------------===// -// 'strto*' Optimizations. This handles strtol, strtod, strtof, strtoul, etc. - -struct StrToOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if ((FT->getNumParams() != 2 && FT->getNumParams() != 3) || - !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy()) - return 0; - - Value *EndPtr = CI->getArgOperand(1); - if (isa<ConstantPointerNull>(EndPtr)) { - // With a null EndPtr, this function won't capture the main argument. - // It would be readonly too, except that it still may write to errno. 
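
StrLenOpt above folds strlen(x) == 0 (and != 0) into a test of the first byte when the call result only feeds zero-equality comparisons. In plain C++ terms (illustration only):

#include <string.h>
#include <stdio.h>

static bool viaStrlen(const char *S)    { return strlen(S) == 0; }
static bool viaFirstByte(const char *S) { return *S == '\0'; }

int main() {
  const char *Tests[] = { "", "a", "hello" };
  for (int i = 0; i != 3; ++i)
    printf("\"%s\": %d %d\n", Tests[i], viaStrlen(Tests[i]),
           viaFirstByte(Tests[i]));
  return 0;
}
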
- CI->addAttribute(1, Attribute::NoCapture); - } - - return 0; - } -}; - -//===---------------------------------------===// -// 'strspn' Optimizations - -struct StrSpnOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || - FT->getParamType(0) != B.getInt8PtrTy() || - FT->getParamType(1) != FT->getParamType(0) || - !FT->getReturnType()->isIntegerTy()) - return 0; - - StringRef S1, S2; - bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); - bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); - - // strspn(s, "") -> 0 - // strspn("", s) -> 0 - if ((HasS1 && S1.empty()) || (HasS2 && S2.empty())) - return Constant::getNullValue(CI->getType()); - - // Constant folding. - if (HasS1 && HasS2) { - size_t Pos = S1.find_first_not_of(S2); - if (Pos == StringRef::npos) Pos = S1.size(); - return ConstantInt::get(CI->getType(), Pos); - } - - return 0; - } -}; - -//===---------------------------------------===// -// 'strcspn' Optimizations - -struct StrCSpnOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || - FT->getParamType(0) != B.getInt8PtrTy() || - FT->getParamType(1) != FT->getParamType(0) || - !FT->getReturnType()->isIntegerTy()) - return 0; - - StringRef S1, S2; - bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); - bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); - - // strcspn("", s) -> 0 - if (HasS1 && S1.empty()) - return Constant::getNullValue(CI->getType()); - - // Constant folding. - if (HasS1 && HasS2) { - size_t Pos = S1.find_first_of(S2); - if (Pos == StringRef::npos) Pos = S1.size(); - return ConstantInt::get(CI->getType(), Pos); - } - - // strcspn(s, "") -> strlen(s) - if (TD && HasS2 && S2.empty()) - return EmitStrLen(CI->getArgOperand(0), B, TD, TLI); - - return 0; - } -}; - -//===---------------------------------------===// -// 'strstr' Optimizations - -struct StrStrOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || - !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - !FT->getReturnType()->isPointerTy()) - return 0; - - // fold strstr(x, x) -> x. - if (CI->getArgOperand(0) == CI->getArgOperand(1)) - return B.CreateBitCast(CI->getArgOperand(0), CI->getType()); - - // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0 - if (TD && IsOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) { - Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, TD, TLI); - if (!StrLen) - return 0; - Value *StrNCmp = EmitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1), - StrLen, B, TD, TLI); - if (!StrNCmp) - return 0; - for (Value::use_iterator UI = CI->use_begin(), UE = CI->use_end(); - UI != UE; ) { - ICmpInst *Old = cast<ICmpInst>(*UI++); - Value *Cmp = B.CreateICmp(Old->getPredicate(), StrNCmp, - ConstantInt::getNullValue(StrNCmp->getType()), - "cmp"); - Old->replaceAllUsesWith(Cmp); - Old->eraseFromParent(); - } - return CI; - } - - // See if either input string is a constant string. - StringRef SearchStr, ToFindStr; - bool HasStr1 = getConstantStringInfo(CI->getArgOperand(0), SearchStr); - bool HasStr2 = getConstantStringInfo(CI->getArgOperand(1), ToFindStr); - - // fold strstr(x, "") -> x. 
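
StrCSpnOpt above turns strcspn(s, "") into strlen(s): with an empty reject set the scan runs to the end of the string. A quick plain C++ check (illustration only):

#include <string.h>
#include <stdio.h>

int main() {
  const char *S = "hello world";
  // Both calls count the full length of S when the reject set is empty.
  printf("strcspn=%zu strlen=%zu\n", strcspn(S, ""), strlen(S));
  return 0;
}
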
- if (HasStr2 && ToFindStr.empty()) - return B.CreateBitCast(CI->getArgOperand(0), CI->getType()); - - // If both strings are known, constant fold it. - if (HasStr1 && HasStr2) { - std::string::size_type Offset = SearchStr.find(ToFindStr); - - if (Offset == StringRef::npos) // strstr("foo", "bar") -> null - return Constant::getNullValue(CI->getType()); - - // strstr("abcd", "bc") -> gep((char*)"abcd", 1) - Value *Result = CastToCStr(CI->getArgOperand(0), B); - Result = B.CreateConstInBoundsGEP1_64(Result, Offset, "strstr"); - return B.CreateBitCast(Result, CI->getType()); - } - - // fold strstr(x, "y") -> strchr(x, 'y'). - if (HasStr2 && ToFindStr.size() == 1) { - Value *StrChr= EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TD, TLI); - return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : 0; - } - return 0; - } -}; - - -//===---------------------------------------===// -// 'memcmp' Optimizations - -struct MemCmpOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - !FT->getReturnType()->isIntegerTy(32)) - return 0; - - Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1); - - if (LHS == RHS) // memcmp(s,s,x) -> 0 - return Constant::getNullValue(CI->getType()); - - // Make sure we have a constant length. - ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2)); - if (!LenC) return 0; - uint64_t Len = LenC->getZExtValue(); - - if (Len == 0) // memcmp(s1,s2,0) -> 0 - return Constant::getNullValue(CI->getType()); - - // memcmp(S1,S2,1) -> *(unsigned char*)LHS - *(unsigned char*)RHS - if (Len == 1) { - Value *LHSV = B.CreateZExt(B.CreateLoad(CastToCStr(LHS, B), "lhsc"), - CI->getType(), "lhsv"); - Value *RHSV = B.CreateZExt(B.CreateLoad(CastToCStr(RHS, B), "rhsc"), - CI->getType(), "rhsv"); - return B.CreateSub(LHSV, RHSV, "chardiff"); - } - - // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant) - StringRef LHSStr, RHSStr; - if (getConstantStringInfo(LHS, LHSStr) && - getConstantStringInfo(RHS, RHSStr)) { - // Make sure we're not reading out-of-bounds memory. - if (Len > LHSStr.size() || Len > RHSStr.size()) - return 0; - uint64_t Ret = memcmp(LHSStr.data(), RHSStr.data(), Len); - return ConstantInt::get(CI->getType(), Ret); - } - - return 0; - } -}; - -//===---------------------------------------===// -// 'memcpy' Optimizations - -struct MemCpyOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // These optimizations require TargetData. - if (!TD) return 0; - - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - FT->getParamType(2) != TD->getIntPtrType(*Context)) - return 0; - - // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1) - B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), 1); - return CI->getArgOperand(0); - } -}; - -//===---------------------------------------===// -// 'memmove' Optimizations - -struct MemMoveOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // These optimizations require TargetData. 
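
MemCmpOpt's single-byte case above replaces memcmp(a, b, 1) with the difference of the first bytes, loaded as unsigned chars. A plain C++ sign check (illustration only):

#include <string.h>
#include <stdio.h>

static int sign(int V) { return (V > 0) - (V < 0); }

int main() {
  const char A[] = "\x80", B[] = "\x01";   // high byte vs. low byte
  int ByteDiff = (unsigned char)A[0] - (unsigned char)B[0];
  printf("memcmp sign: %d  byte-diff sign: %d\n",
         sign(memcmp(A, B, 1)), sign(ByteDiff));
  return 0;
}
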
- if (!TD) return 0; - - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - FT->getParamType(2) != TD->getIntPtrType(*Context)) - return 0; - - // memmove(x, y, n) -> llvm.memmove(x, y, n, 1) - B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), 1); - return CI->getArgOperand(0); - } -}; - -//===---------------------------------------===// -// 'memset' Optimizations - -struct MemSetOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // These optimizations require TargetData. - if (!TD) return 0; - - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isIntegerTy() || - FT->getParamType(2) != TD->getIntPtrType(*Context)) - return 0; - - // memset(p, v, n) -> llvm.memset(p, v, n, 1) - Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); - B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1); - return CI->getArgOperand(0); - } -}; - -//===----------------------------------------------------------------------===// -// Math Library Optimizations -//===----------------------------------------------------------------------===// - -//===---------------------------------------===// -// Double -> Float Shrinking Optimizations for Unary Functions like 'floor' - -struct UnaryDoubleFPOpt : public LibCallOptimization { - bool CheckRetType; - UnaryDoubleFPOpt(bool CheckReturnType): CheckRetType(CheckReturnType) {} - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() || - !FT->getParamType(0)->isDoubleTy()) - return 0; - - if (CheckRetType) { - // Check if all the uses for function like 'sin' are converted to float. - for (Value::use_iterator UseI = CI->use_begin(); UseI != CI->use_end(); - ++UseI) { - FPTruncInst *Cast = dyn_cast<FPTruncInst>(*UseI); - if (Cast == 0 || !Cast->getType()->isFloatTy()) - return 0; - } - } - - // If this is something like 'floor((double)floatval)', convert to floorf. - FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0)); - if (Cast == 0 || !Cast->getOperand(0)->getType()->isFloatTy()) - return 0; - - // floor((double)floatval) -> (double)floorf(floatval) - Value *V = Cast->getOperand(0); - V = EmitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes()); - return B.CreateFPExt(V, B.getDoubleTy()); - } -}; - -//===---------------------------------------===// -// 'cos*' Optimizations -struct CosOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - Value *Ret = NULL; - if (UnsafeFPShrink && Callee->getName() == "cos" && - TLI->has(LibFunc::cosf)) { - UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true); - Ret = UnsafeUnaryDoubleFP.CallOptimizer(Callee, CI, B); - } - - FunctionType *FT = Callee->getFunctionType(); - // Just make sure this has 1 argument of FP type, which matches the - // result type. 
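
UnaryDoubleFPOpt above shrinks calls like floor((double)floatval) to the float variant when the result is immediately truncated back; for floor the two routes agree exactly, since flooring a value that fits in a float yields a float-representable result. A plain C++ comparison (illustration only):

#include <math.h>
#include <stdio.h>

int main() {
  float Vals[] = { 2.7f, -2.7f, 1.0e7f, -0.5f };
  for (int i = 0; i != 4; ++i) {
    float F = Vals[i];
    double Wide = floor((double)F);        // original: widen, then call floor
    float Narrow = floorf(F);              // shrunk form: call floorf directly
    printf("%g: %g %g\n", (double)F, Wide, (double)Narrow);
  }
  return 0;
}
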
- if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isFloatingPointTy()) - return Ret; - - // cos(-x) -> cos(x) - Value *Op1 = CI->getArgOperand(0); - if (BinaryOperator::isFNeg(Op1)) { - BinaryOperator *BinExpr = cast<BinaryOperator>(Op1); - return B.CreateCall(Callee, BinExpr->getOperand(1), "cos"); - } - return Ret; - } -}; - -//===---------------------------------------===// -// 'pow*' Optimizations - -struct PowOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - Value *Ret = NULL; - if (UnsafeFPShrink && Callee->getName() == "pow" && - TLI->has(LibFunc::powf)) { - UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true); - Ret = UnsafeUnaryDoubleFP.CallOptimizer(Callee, CI, B); - } - - FunctionType *FT = Callee->getFunctionType(); - // Just make sure this has 2 arguments of the same FP type, which match the - // result type. - if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - !FT->getParamType(0)->isFloatingPointTy()) - return Ret; - - Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1); - if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) { - if (Op1C->isExactlyValue(1.0)) // pow(1.0, x) -> 1.0 - return Op1C; - if (Op1C->isExactlyValue(2.0)) // pow(2.0, x) -> exp2(x) - return EmitUnaryFloatFnCall(Op2, "exp2", B, Callee->getAttributes()); - } - - ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2); - if (Op2C == 0) return Ret; - - if (Op2C->getValueAPF().isZero()) // pow(x, 0.0) -> 1.0 - return ConstantFP::get(CI->getType(), 1.0); - - if (Op2C->isExactlyValue(0.5)) { - // Expand pow(x, 0.5) to (x == -infinity ? +infinity : fabs(sqrt(x))). - // This is faster than calling pow, and still handles negative zero - // and negative infinity correctly. - // TODO: In fast-math mode, this could be just sqrt(x). - // TODO: In finite-only mode, this could be just fabs(sqrt(x)). - Value *Inf = ConstantFP::getInfinity(CI->getType()); - Value *NegInf = ConstantFP::getInfinity(CI->getType(), true); - Value *Sqrt = EmitUnaryFloatFnCall(Op1, "sqrt", B, - Callee->getAttributes()); - Value *FAbs = EmitUnaryFloatFnCall(Sqrt, "fabs", B, - Callee->getAttributes()); - Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf); - Value *Sel = B.CreateSelect(FCmp, Inf, FAbs); - return Sel; - } - - if (Op2C->isExactlyValue(1.0)) // pow(x, 1.0) -> x - return Op1; - if (Op2C->isExactlyValue(2.0)) // pow(x, 2.0) -> x*x - return B.CreateFMul(Op1, Op1, "pow2"); - if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x - return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), - Op1, "powrecip"); - return 0; - } -}; - -//===---------------------------------------===// -// 'exp2' Optimizations - -struct Exp2Opt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - Value *Ret = NULL; - if (UnsafeFPShrink && Callee->getName() == "exp2" && - TLI->has(LibFunc::exp2)) { - UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true); - Ret = UnsafeUnaryDoubleFP.CallOptimizer(Callee, CI, B); - } - - FunctionType *FT = Callee->getFunctionType(); - // Just make sure this has 1 argument of FP type, which matches the - // result type. 
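
PowOpt above expands pow(x, 0.5) to (x == -inf ? +inf : fabs(sqrt(x))) rather than a bare sqrt, because sqrt alone gets -0.0 and -infinity wrong. A plain C++ illustration of those edge cases:

#include <math.h>
#include <stdio.h>

static double expanded(double X) {
  double Inf = INFINITY;
  return X == -Inf ? Inf : fabs(sqrt(X));  // the pass's expansion, in C terms
}

int main() {
  double Cases[] = { 4.0, -0.0, -INFINITY };
  for (int i = 0; i != 3; ++i) {
    double X = Cases[i];
    printf("x=%g  pow(x,0.5)=%g  sqrt(x)=%g  expanded=%g\n",
           X, pow(X, 0.5), sqrt(X), expanded(X));
  }
  return 0;
}
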
- if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isFloatingPointTy()) - return Ret; - - Value *Op = CI->getArgOperand(0); - // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32 - // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < 32 - Value *LdExpArg = 0; - if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) { - if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32) - LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty()); - } else if (UIToFPInst *OpC = dyn_cast<UIToFPInst>(Op)) { - if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32) - LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty()); - } - - if (LdExpArg) { - const char *Name; - if (Op->getType()->isFloatTy()) - Name = "ldexpf"; - else if (Op->getType()->isDoubleTy()) - Name = "ldexp"; - else - Name = "ldexpl"; - - Constant *One = ConstantFP::get(*Context, APFloat(1.0f)); - if (!Op->getType()->isFloatTy()) - One = ConstantExpr::getFPExtend(One, Op->getType()); - - Module *M = Caller->getParent(); - Value *Callee = M->getOrInsertFunction(Name, Op->getType(), - Op->getType(), - B.getInt32Ty(), NULL); - CallInst *CI = B.CreateCall2(Callee, One, LdExpArg); - if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts())) - CI->setCallingConv(F->getCallingConv()); - - return CI; - } - return Ret; - } -}; - -//===----------------------------------------------------------------------===// -// Integer Optimizations -//===----------------------------------------------------------------------===// - -//===---------------------------------------===// -// 'ffs*' Optimizations - -struct FFSOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - // Just make sure this has 2 arguments of the same FP type, which match the - // result type. - if (FT->getNumParams() != 1 || - !FT->getReturnType()->isIntegerTy(32) || - !FT->getParamType(0)->isIntegerTy()) - return 0; - - Value *Op = CI->getArgOperand(0); - - // Constant fold. - if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { - if (CI->getValue() == 0) // ffs(0) -> 0. - return Constant::getNullValue(CI->getType()); - // ffs(c) -> cttz(c)+1 - return B.getInt32(CI->getValue().countTrailingZeros() + 1); - } - - // ffs(x) -> x != 0 ? 
(i32)llvm.cttz(x)+1 : 0 - Type *ArgType = Op->getType(); - Value *F = Intrinsic::getDeclaration(Callee->getParent(), - Intrinsic::cttz, ArgType); - Value *V = B.CreateCall2(F, Op, B.getFalse(), "cttz"); - V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1)); - V = B.CreateIntCast(V, B.getInt32Ty(), false); - - Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType)); - return B.CreateSelect(Cond, V, B.getInt32(0)); - } -}; - -//===---------------------------------------===// -// 'isdigit' Optimizations - -struct IsDigitOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - // We require integer(i32) - if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || - !FT->getParamType(0)->isIntegerTy(32)) - return 0; - - // isdigit(c) -> (c-'0') <u 10 - Value *Op = CI->getArgOperand(0); - Op = B.CreateSub(Op, B.getInt32('0'), "isdigittmp"); - Op = B.CreateICmpULT(Op, B.getInt32(10), "isdigit"); - return B.CreateZExt(Op, CI->getType()); - } -}; - -//===---------------------------------------===// -// 'isascii' Optimizations - -struct IsAsciiOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - // We require integer(i32) - if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || - !FT->getParamType(0)->isIntegerTy(32)) - return 0; - - // isascii(c) -> c <u 128 - Value *Op = CI->getArgOperand(0); - Op = B.CreateICmpULT(Op, B.getInt32(128), "isascii"); - return B.CreateZExt(Op, CI->getType()); - } -}; - -//===---------------------------------------===// -// 'abs', 'labs', 'llabs' Optimizations - -struct AbsOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - // We require integer(integer) where the types agree. - if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || - FT->getParamType(0) != FT->getReturnType()) - return 0; - - // abs(x) -> x >s -1 ? x : -x - Value *Op = CI->getArgOperand(0); - Value *Pos = B.CreateICmpSGT(Op, Constant::getAllOnesValue(Op->getType()), - "ispos"); - Value *Neg = B.CreateNeg(Op, "neg"); - return B.CreateSelect(Pos, Op, Neg); - } -}; - - -//===---------------------------------------===// -// 'toascii' Optimizations - -struct ToAsciiOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - // We require i32(i32) - if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isIntegerTy(32)) - return 0; - - // isascii(c) -> c & 0x7f - return B.CreateAnd(CI->getArgOperand(0), - ConstantInt::get(CI->getType(),0x7F)); - } -}; - -//===----------------------------------------------------------------------===// -// Formatting and IO Optimizations -//===----------------------------------------------------------------------===// - -//===---------------------------------------===// -// 'printf' Optimizations - -struct PrintFOpt : public LibCallOptimization { - Value *OptimizeFixedFormatString(Function *Callee, CallInst *CI, - IRBuilder<> &B) { - // Check for a fixed format string. - StringRef FormatStr; - if (!getConstantStringInfo(CI->getArgOperand(0), FormatStr)) - return 0; - - // Empty format string -> noop. 
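
IsDigitOpt above folds isdigit(c) into the unsigned range check (c - '0') <u 10. A plain C++ check over the ASCII range in the default C locale (illustration only):

#include <ctype.h>
#include <stdio.h>

int main() {
  for (int C = 0; C < 128; ++C) {
    int Lib = isdigit(C) != 0;
    int Folded = (unsigned)(C - '0') < 10u;   // the pass's rewritten form
    if (Lib != Folded)
      printf("mismatch at %d\n", C);
  }
  printf("checked 0..127\n");
  return 0;
}
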
- if (FormatStr.empty()) // Tolerate printf's declared void. - return CI->use_empty() ? (Value*)CI : - ConstantInt::get(CI->getType(), 0); - - // Do not do any of the following transformations if the printf return value - // is used, in general the printf return value is not compatible with either - // putchar() or puts(). - if (!CI->use_empty()) - return 0; - - // printf("x") -> putchar('x'), even for '%'. - if (FormatStr.size() == 1) { - Value *Res = EmitPutChar(B.getInt32(FormatStr[0]), B, TD, TLI); - if (CI->use_empty() || !Res) return Res; - return B.CreateIntCast(Res, CI->getType(), true); - } - - // printf("foo\n") --> puts("foo") - if (FormatStr[FormatStr.size()-1] == '\n' && - FormatStr.find('%') == std::string::npos) { // no format characters. - // Create a string literal with no \n on it. We expect the constant merge - // pass to be run after this pass, to merge duplicate strings. - FormatStr = FormatStr.drop_back(); - Value *GV = B.CreateGlobalString(FormatStr, "str"); - Value *NewCI = EmitPutS(GV, B, TD, TLI); - return (CI->use_empty() || !NewCI) ? - NewCI : - ConstantInt::get(CI->getType(), FormatStr.size()+1); - } - - // Optimize specific format strings. - // printf("%c", chr) --> putchar(chr) - if (FormatStr == "%c" && CI->getNumArgOperands() > 1 && - CI->getArgOperand(1)->getType()->isIntegerTy()) { - Value *Res = EmitPutChar(CI->getArgOperand(1), B, TD, TLI); - - if (CI->use_empty() || !Res) return Res; - return B.CreateIntCast(Res, CI->getType(), true); - } - - // printf("%s\n", str) --> puts(str) - if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 && - CI->getArgOperand(1)->getType()->isPointerTy()) { - return EmitPutS(CI->getArgOperand(1), B, TD, TLI); - } - return 0; - } - - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Require one fixed pointer argument and an integer/void result. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() || - !(FT->getReturnType()->isIntegerTy() || - FT->getReturnType()->isVoidTy())) - return 0; - - if (Value *V = OptimizeFixedFormatString(Callee, CI, B)) { - return V; - } - - // printf(format, ...) -> iprintf(format, ...) if no floating point - // arguments. - if (TLI->has(LibFunc::iprintf) && !CallHasFloatingPointArgument(CI)) { - Module *M = B.GetInsertBlock()->getParent()->getParent(); - Constant *IPrintFFn = - M->getOrInsertFunction("iprintf", FT, Callee->getAttributes()); - CallInst *New = cast<CallInst>(CI->clone()); - New->setCalledFunction(IPrintFFn); - B.Insert(New); - return New; - } - return 0; - } -}; - -//===---------------------------------------===// -// 'sprintf' Optimizations - -struct SPrintFOpt : public LibCallOptimization { - Value *OptimizeFixedFormatString(Function *Callee, CallInst *CI, - IRBuilder<> &B) { - // Check for a fixed format string. - StringRef FormatStr; - if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr)) - return 0; - - // If we just have a format string (nothing else crazy) transform it. - if (CI->getNumArgOperands() == 2) { - // Make sure there's no % in the constant array. We could try to handle - // %% -> % in the future if we cared. - for (unsigned i = 0, e = FormatStr.size(); i != e; ++i) - if (FormatStr[i] == '%') - return 0; // we found a format specifier, bail out. - - // These optimizations require TargetData. 
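
PrintFOpt above rewrites printf of a constant, percent-free string ending in '\n' into puts on the string minus the newline; puts adds the newline itself, and the rewrite is only done when printf's return value is unused. In plain C++ (illustration only):

#include <stdio.h>

int main() {
  printf("hello\n");   // original call
  puts("hello");       // rewritten form: the trailing '\n' is dropped from the text
  // Both lines produce exactly the same bytes on stdout; only the return
  // values differ, which is why the pass requires the result to be unused.
  return 0;
}
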
- if (!TD) return 0; - - // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1) - B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), - ConstantInt::get(TD->getIntPtrType(*Context), // Copy the - FormatStr.size() + 1), 1); // nul byte. - return ConstantInt::get(CI->getType(), FormatStr.size()); - } - - // The remaining optimizations require the format string to be "%s" or "%c" - // and have an extra operand. - if (FormatStr.size() != 2 || FormatStr[0] != '%' || - CI->getNumArgOperands() < 3) - return 0; - - // Decode the second character of the format string. - if (FormatStr[1] == 'c') { - // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0 - if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0; - Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char"); - Value *Ptr = CastToCStr(CI->getArgOperand(0), B); - B.CreateStore(V, Ptr); - Ptr = B.CreateGEP(Ptr, B.getInt32(1), "nul"); - B.CreateStore(B.getInt8(0), Ptr); - - return ConstantInt::get(CI->getType(), 1); - } - - if (FormatStr[1] == 's') { - // These optimizations require TargetData. - if (!TD) return 0; - - // sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1) - if (!CI->getArgOperand(2)->getType()->isPointerTy()) return 0; - - Value *Len = EmitStrLen(CI->getArgOperand(2), B, TD, TLI); - if (!Len) - return 0; - Value *IncLen = B.CreateAdd(Len, - ConstantInt::get(Len->getType(), 1), - "leninc"); - B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(2), IncLen, 1); - - // The sprintf result is the unincremented number of bytes in the string. - return B.CreateIntCast(Len, CI->getType(), false); - } - return 0; - } - - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Require two fixed pointer arguments and an integer result. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - !FT->getReturnType()->isIntegerTy()) - return 0; - - if (Value *V = OptimizeFixedFormatString(Callee, CI, B)) { - return V; - } - - // sprintf(str, format, ...) -> siprintf(str, format, ...) if no floating - // point arguments. - if (TLI->has(LibFunc::siprintf) && !CallHasFloatingPointArgument(CI)) { - Module *M = B.GetInsertBlock()->getParent()->getParent(); - Constant *SIPrintFFn = - M->getOrInsertFunction("siprintf", FT, Callee->getAttributes()); - CallInst *New = cast<CallInst>(CI->clone()); - New->setCalledFunction(SIPrintFFn); - B.Insert(New); - return New; - } - return 0; - } -}; - -//===---------------------------------------===// -// 'fwrite' Optimizations - -struct FWriteOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Require a pointer, an integer, an integer, a pointer, returning integer. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 4 || !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isIntegerTy() || - !FT->getParamType(2)->isIntegerTy() || - !FT->getParamType(3)->isPointerTy() || - !FT->getReturnType()->isIntegerTy()) - return 0; - - // Get the element size and count. - ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); - ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2)); - if (!SizeC || !CountC) return 0; - uint64_t Bytes = SizeC->getZExtValue()*CountC->getZExtValue(); - - // If this is writing zero records, remove the call (it's a noop). 
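
SPrintFOpt's "%s" case above becomes a strlen plus a memcpy of len+1 bytes, with the unincremented length as the sprintf return value. A plain C++ check (illustration only):

#include <string.h>
#include <stdio.h>

int main() {
  char A[32], B[32];
  const char *Src = "hello";

  int Ret = sprintf(A, "%s", Src);   // the original libcall
  size_t Len = strlen(Src);
  memcpy(B, Src, Len + 1);           // the emitted strlen + memcpy

  printf("ret=%d len=%zu equal=%d\n", Ret, Len, strcmp(A, B) == 0);
  return 0;
}
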
- if (Bytes == 0) - return ConstantInt::get(CI->getType(), 0); - - // If this is writing one byte, turn it into fputc. - // This optimisation is only valid, if the return value is unused. - if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F) - Value *Char = B.CreateLoad(CastToCStr(CI->getArgOperand(0), B), "char"); - Value *NewCI = EmitFPutC(Char, CI->getArgOperand(3), B, TD, TLI); - return NewCI ? ConstantInt::get(CI->getType(), 1) : 0; - } - - return 0; - } -}; - -//===---------------------------------------===// -// 'fputs' Optimizations - -struct FPutsOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // These optimizations require TargetData. - if (!TD) return 0; - - // Require two pointers. Also, we can't optimize if return value is used. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - !CI->use_empty()) - return 0; - - // fputs(s,F) --> fwrite(s,1,strlen(s),F) - uint64_t Len = GetStringLength(CI->getArgOperand(0)); - if (!Len) return 0; - // Known to have no uses (see above). - return EmitFWrite(CI->getArgOperand(0), - ConstantInt::get(TD->getIntPtrType(*Context), Len-1), - CI->getArgOperand(1), B, TD, TLI); - } -}; - -//===---------------------------------------===// -// 'fprintf' Optimizations - -struct FPrintFOpt : public LibCallOptimization { - Value *OptimizeFixedFormatString(Function *Callee, CallInst *CI, - IRBuilder<> &B) { - // All the optimizations depend on the format string. - StringRef FormatStr; - if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr)) - return 0; - - // fprintf(F, "foo") --> fwrite("foo", 3, 1, F) - if (CI->getNumArgOperands() == 2) { - for (unsigned i = 0, e = FormatStr.size(); i != e; ++i) - if (FormatStr[i] == '%') // Could handle %% -> % if we cared. - return 0; // We found a format specifier. - - // These optimizations require TargetData. - if (!TD) return 0; - - Value *NewCI = EmitFWrite(CI->getArgOperand(1), - ConstantInt::get(TD->getIntPtrType(*Context), - FormatStr.size()), - CI->getArgOperand(0), B, TD, TLI); - return NewCI ? ConstantInt::get(CI->getType(), FormatStr.size()) : 0; - } - - // The remaining optimizations require the format string to be "%s" or "%c" - // and have an extra operand. - if (FormatStr.size() != 2 || FormatStr[0] != '%' || - CI->getNumArgOperands() < 3) - return 0; - - // Decode the second character of the format string. - if (FormatStr[1] == 'c') { - // fprintf(F, "%c", chr) --> fputc(chr, F) - if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0; - Value *NewCI = EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, - TD, TLI); - return NewCI ? ConstantInt::get(CI->getType(), 1) : 0; - } - - if (FormatStr[1] == 's') { - // fprintf(F, "%s", str) --> fputs(str, F) - if (!CI->getArgOperand(2)->getType()->isPointerTy() || !CI->use_empty()) - return 0; - return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TD, TLI); - } - return 0; - } - - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Require two fixed paramters as pointers and integer result. 
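
FPutsOpt above turns fputs(s, f) with a known-length s into fwrite(s, 1, strlen(s), f); the rewrite only fires when the return value is unused, since the two calls report success differently. In plain C++ (illustration only):

#include <stdio.h>
#include <string.h>

int main() {
  const char *S = "abc";
  fputs(S, stdout);                   // original libcall
  fwrite(S, 1, strlen(S), stdout);    // rewritten form used by the pass
  fputc('\n', stdout);
  return 0;
}
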
- FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - !FT->getReturnType()->isIntegerTy()) - return 0; - - if (Value *V = OptimizeFixedFormatString(Callee, CI, B)) { - return V; - } - - // fprintf(stream, format, ...) -> fiprintf(stream, format, ...) if no - // floating point arguments. - if (TLI->has(LibFunc::fiprintf) && !CallHasFloatingPointArgument(CI)) { - Module *M = B.GetInsertBlock()->getParent()->getParent(); - Constant *FIPrintFFn = - M->getOrInsertFunction("fiprintf", FT, Callee->getAttributes()); - CallInst *New = cast<CallInst>(CI->clone()); - New->setCalledFunction(FIPrintFFn); - B.Insert(New); - return New; - } - return 0; - } -}; - -//===---------------------------------------===// -// 'puts' Optimizations - -struct PutsOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Require one fixed pointer argument and an integer/void result. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() || - !(FT->getReturnType()->isIntegerTy() || - FT->getReturnType()->isVoidTy())) - return 0; - - // Check for a constant string. - StringRef Str; - if (!getConstantStringInfo(CI->getArgOperand(0), Str)) - return 0; - - if (Str.empty() && CI->use_empty()) { - // puts("") -> putchar('\n') - Value *Res = EmitPutChar(B.getInt32('\n'), B, TD, TLI); - if (CI->use_empty() || !Res) return Res; - return B.CreateIntCast(Res, CI->getType(), true); - } - - return 0; - } -}; - -} // end anonymous namespace. - -//===----------------------------------------------------------------------===// // SimplifyLibCalls Pass Implementation //===----------------------------------------------------------------------===// @@ -1561,32 +91,11 @@ namespace { TargetLibraryInfo *TLI; StringMap<LibCallOptimization*> Optimizations; - // String and Memory LibCall Optimizations - StrCatOpt StrCat; StrNCatOpt StrNCat; StrChrOpt StrChr; StrRChrOpt StrRChr; - StrCmpOpt StrCmp; StrNCmpOpt StrNCmp; - StrCpyOpt StrCpy; StrCpyOpt StrCpyChk; - StpCpyOpt StpCpy; StpCpyOpt StpCpyChk; - StrNCpyOpt StrNCpy; - StrLenOpt StrLen; StrPBrkOpt StrPBrk; - StrToOpt StrTo; StrSpnOpt StrSpn; StrCSpnOpt StrCSpn; StrStrOpt StrStr; - MemCmpOpt MemCmp; MemCpyOpt MemCpy; MemMoveOpt MemMove; MemSetOpt MemSet; - // Math Library Optimizations - CosOpt Cos; PowOpt Pow; Exp2Opt Exp2; - UnaryDoubleFPOpt UnaryDoubleFP, UnsafeUnaryDoubleFP; - // Integer Optimizations - FFSOpt FFS; AbsOpt Abs; IsDigitOpt IsDigit; IsAsciiOpt IsAscii; - ToAsciiOpt ToAscii; - // Formatting and IO Optimizations - SPrintFOpt SPrintF; PrintFOpt PrintF; - FWriteOpt FWrite; FPutsOpt FPuts; FPrintFOpt FPrintF; - PutsOpt Puts; bool Modified; // This is only used by doInitialization. public: static char ID; // Pass identification - SimplifyLibCalls() : FunctionPass(ID), StrCpy(false), StrCpyChk(true), - StpCpy(false), StpCpyChk(true), - UnaryDoubleFP(false), UnsafeUnaryDoubleFP(true) { + SimplifyLibCalls() : FunctionPass(ID) { initializeSimplifyLibCallsPass(*PassRegistry::getPassRegistry()); } void AddOpt(LibFunc::Func F, LibCallOptimization* Opt); @@ -1636,108 +145,6 @@ void SimplifyLibCalls::AddOpt(LibFunc::Func F1, LibFunc::Func F2, /// Optimizations - Populate the Optimizations map with all the optimizations /// we know. 
void SimplifyLibCalls::InitOptimizations() { - // String and Memory LibCall Optimizations - Optimizations["strcat"] = &StrCat; - Optimizations["strncat"] = &StrNCat; - Optimizations["strchr"] = &StrChr; - Optimizations["strrchr"] = &StrRChr; - Optimizations["strcmp"] = &StrCmp; - Optimizations["strncmp"] = &StrNCmp; - Optimizations["strcpy"] = &StrCpy; - Optimizations["strncpy"] = &StrNCpy; - Optimizations["stpcpy"] = &StpCpy; - Optimizations["strlen"] = &StrLen; - Optimizations["strpbrk"] = &StrPBrk; - Optimizations["strtol"] = &StrTo; - Optimizations["strtod"] = &StrTo; - Optimizations["strtof"] = &StrTo; - Optimizations["strtoul"] = &StrTo; - Optimizations["strtoll"] = &StrTo; - Optimizations["strtold"] = &StrTo; - Optimizations["strtoull"] = &StrTo; - Optimizations["strspn"] = &StrSpn; - Optimizations["strcspn"] = &StrCSpn; - Optimizations["strstr"] = &StrStr; - Optimizations["memcmp"] = &MemCmp; - AddOpt(LibFunc::memcpy, &MemCpy); - Optimizations["memmove"] = &MemMove; - AddOpt(LibFunc::memset, &MemSet); - - // _chk variants of String and Memory LibCall Optimizations. - Optimizations["__strcpy_chk"] = &StrCpyChk; - Optimizations["__stpcpy_chk"] = &StpCpyChk; - - // Math Library Optimizations - Optimizations["cosf"] = &Cos; - Optimizations["cos"] = &Cos; - Optimizations["cosl"] = &Cos; - Optimizations["powf"] = &Pow; - Optimizations["pow"] = &Pow; - Optimizations["powl"] = &Pow; - Optimizations["llvm.pow.f32"] = &Pow; - Optimizations["llvm.pow.f64"] = &Pow; - Optimizations["llvm.pow.f80"] = &Pow; - Optimizations["llvm.pow.f128"] = &Pow; - Optimizations["llvm.pow.ppcf128"] = &Pow; - Optimizations["exp2l"] = &Exp2; - Optimizations["exp2"] = &Exp2; - Optimizations["exp2f"] = &Exp2; - Optimizations["llvm.exp2.ppcf128"] = &Exp2; - Optimizations["llvm.exp2.f128"] = &Exp2; - Optimizations["llvm.exp2.f80"] = &Exp2; - Optimizations["llvm.exp2.f64"] = &Exp2; - Optimizations["llvm.exp2.f32"] = &Exp2; - - AddOpt(LibFunc::ceil, LibFunc::ceilf, &UnaryDoubleFP); - AddOpt(LibFunc::fabs, LibFunc::fabsf, &UnaryDoubleFP); - AddOpt(LibFunc::floor, LibFunc::floorf, &UnaryDoubleFP); - AddOpt(LibFunc::rint, LibFunc::rintf, &UnaryDoubleFP); - AddOpt(LibFunc::round, LibFunc::roundf, &UnaryDoubleFP); - AddOpt(LibFunc::nearbyint, LibFunc::nearbyintf, &UnaryDoubleFP); - AddOpt(LibFunc::trunc, LibFunc::truncf, &UnaryDoubleFP); - - if(UnsafeFPShrink) { - AddOpt(LibFunc::acos, LibFunc::acosf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::acosh, LibFunc::acoshf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::asin, LibFunc::asinf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::asinh, LibFunc::asinhf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::atan, LibFunc::atanf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::atanh, LibFunc::atanhf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::cbrt, LibFunc::cbrtf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::cosh, LibFunc::coshf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::exp, LibFunc::expf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::exp10, LibFunc::exp10f, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::expm1, LibFunc::expm1f, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::log, LibFunc::logf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::log10, LibFunc::log10f, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::log1p, LibFunc::log1pf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::log2, LibFunc::log2f, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::logb, LibFunc::logbf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::sin, LibFunc::sinf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::sinh, LibFunc::sinhf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::sqrt, LibFunc::sqrtf, 
&UnsafeUnaryDoubleFP); - AddOpt(LibFunc::tan, LibFunc::tanf, &UnsafeUnaryDoubleFP); - AddOpt(LibFunc::tanh, LibFunc::tanhf, &UnsafeUnaryDoubleFP); - } - - // Integer Optimizations - Optimizations["ffs"] = &FFS; - Optimizations["ffsl"] = &FFS; - Optimizations["ffsll"] = &FFS; - Optimizations["abs"] = &Abs; - Optimizations["labs"] = &Abs; - Optimizations["llabs"] = &Abs; - Optimizations["isdigit"] = &IsDigit; - Optimizations["isascii"] = &IsAscii; - Optimizations["toascii"] = &ToAscii; - - // Formatting and IO Optimizations - Optimizations["sprintf"] = &SPrintF; - Optimizations["printf"] = &PrintF; - AddOpt(LibFunc::fwrite, &FWrite); - AddOpt(LibFunc::fputs, &FPuts); - Optimizations["fprintf"] = &FPrintF; - Optimizations["puts"] = &Puts; } @@ -1749,7 +156,7 @@ bool SimplifyLibCalls::runOnFunction(Function &F) { if (Optimizations.empty()) InitOptimizations(); - const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); IRBuilder<> Builder(F.getContext()); @@ -1785,7 +192,6 @@ bool SimplifyLibCalls::runOnFunction(Function &F) { // Something changed! Changed = true; - ++NumSimplified; // Inspect the instruction after the call (which was potentially just // added) next. diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp index 34f1d6c..d4595bb 100644 --- a/lib/Transforms/Scalar/Sink.cpp +++ b/lib/Transforms/Scalar/Sink.cpp @@ -14,13 +14,13 @@ #define DEBUG_TYPE "sink" #include "llvm/Transforms/Scalar.h" -#include "llvm/IntrinsicInst.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Assembly/Writer.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index 6557d63..6572e09 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -52,25 +52,25 @@ #define DEBUG_TYPE "tailcallelim" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Function.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Module.h" -#include "llvm/Pass.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/Loads.h" -#include "llvm/Support/CallSite.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" #include "llvm/Support/CFG.h" +#include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; STATISTIC(NumEliminated, "Number of tail calls removed"); |
