Diffstat (limited to 'lib/Transforms'): 36 files changed, 2547 insertions, 904 deletions
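The most involved change below is in GlobalOpt's OptimizeGlobalAliases: alias resolution is rewritten around a new LLVMUsed helper so that entries in llvm.used and llvm.compiler_used stay consistent when an alias is folded into its aliasee. A minimal before/after sketch of the transform described in the patch's own comments (hypothetical IR for illustration, not taken from the patch's tests):

    define internal void @f() {
      ret void
    }
    @a = alias void ()* @f        ; externally visible alias of an internal function

    ; after the pass, @f takes over @a's name, linkage and attributes,
    ; and the alias itself is deleted:
    define void @a() {
      ret void
    }

If @a appeared in @llvm.used, the new usedErase/usedInsert bookkeeping swaps the entry over to the renamed target instead of leaving a reference to the deleted alias.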
diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp index b63495b..a7bf188 100644 --- a/lib/Transforms/IPO/ConstantMerge.cpp +++ b/lib/Transforms/IPO/ConstantMerge.cpp @@ -27,6 +27,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" #include "llvm/Pass.h" using namespace llvm; @@ -68,10 +69,11 @@ static void FindUsedValues(GlobalVariable *LLVMUsed, if (LLVMUsed == 0) return; ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer()); - for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i) - if (GlobalValue *GV = - dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts())) - UsedValues.insert(GV); + for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i) { + Value *Operand = Inits->getOperand(i)->stripPointerCastsNoFollowAliases(); + GlobalValue *GV = cast<GlobalValue>(Operand); + UsedValues.insert(GV); + } } // True if A is better than B. diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp index 49ef1e7..3fdb5f0 100644 --- a/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -343,8 +343,9 @@ bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn) if (Fn.isDeclaration() || Fn.mayBeOverridden()) return false; - // Functions with local linkage should already have been handled. - if (Fn.hasLocalLinkage()) + // Functions with local linkage should already have been handled, except the + // fragile (variadic) ones which we can improve here. + if (Fn.hasLocalLinkage() && !Fn.getFunctionType()->isVarArg()) return false; if (Fn.use_empty()) @@ -604,9 +605,20 @@ void DAE::SurveyFunction(const Function &F) { UseVector MaybeLiveArgUses; for (Function::const_arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E; ++AI, ++i) { - // See what the effect of this use is (recording any uses that cause - // MaybeLive in MaybeLiveArgUses). - Liveness Result = SurveyUses(AI, MaybeLiveArgUses); + Liveness Result; + if (F.getFunctionType()->isVarArg()) { + // Variadic functions will already have a va_arg function expanded inside + // them, making them potentially very sensitive to ABI changes resulting + // from removing arguments entirely, so don't. For example AArch64 handles + // register and stack HFAs very differently, and this is reflected in the + // IR which has already been generated. + Result = Live; + } else { + // See what the effect of this use is (recording any uses that cause + // MaybeLive in MaybeLiveArgUses). + Result = SurveyUses(AI, MaybeLiveArgUses); + } + // Mark the result. MarkValue(CreateArg(&F, i), Result, MaybeLiveArgUses); // Clear the vector again for the next iteration. diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index b035a82..a4de71b 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -3041,8 +3041,168 @@ bool GlobalOpt::OptimizeGlobalCtorsList(GlobalVariable *&GCL) { return true; } +/// \brief Given "llvm.used" or "llvm.compiler_used" as a global name, collect +/// the initializer elements of that global in Set and return the global itself. 
+static GlobalVariable * +collectUsedGlobalVariables(const Module &M, const char *Name, + SmallPtrSet<GlobalValue *, 8> &Set) { + GlobalVariable *GV = M.getGlobalVariable(Name); + if (!GV || !GV->hasInitializer()) + return GV; + + const ConstantArray *Init = cast<ConstantArray>(GV->getInitializer()); + for (unsigned I = 0, E = Init->getNumOperands(); I != E; ++I) { + Value *Op = Init->getOperand(I); + GlobalValue *G = cast<GlobalValue>(Op->stripPointerCastsNoFollowAliases()); + Set.insert(G); + } + return GV; +} + +static int compareNames(const void *A, const void *B) { + const GlobalValue *VA = *reinterpret_cast<GlobalValue* const*>(A); + const GlobalValue *VB = *reinterpret_cast<GlobalValue* const*>(B); + if (VA->getName() < VB->getName()) + return -1; + if (VB->getName() < VA->getName()) + return 1; + return 0; +} + +static void setUsedInitializer(GlobalVariable &V, + SmallPtrSet<GlobalValue *, 8> Init) { + SmallVector<llvm::Constant *, 8> UsedArray; + PointerType *Int8PtrTy = Type::getInt8PtrTy(V.getContext()); + + for (SmallPtrSet<GlobalValue *, 8>::iterator I = Init.begin(), E = Init.end(); + I != E; ++I) { + Constant *Cast = llvm::ConstantExpr::getBitCast(*I, Int8PtrTy); + UsedArray.push_back(Cast); + } + // Sort to get deterministic order. + array_pod_sort(UsedArray.begin(), UsedArray.end(), compareNames); + ArrayType *ATy = ArrayType::get(Int8PtrTy, UsedArray.size()); + + Module *M = V.getParent(); + V.removeFromParent(); + GlobalVariable *NV = + new GlobalVariable(*M, ATy, false, llvm::GlobalValue::AppendingLinkage, + llvm::ConstantArray::get(ATy, UsedArray), ""); + NV->takeName(&V); + NV->setSection("llvm.metadata"); + delete &V; +} + +namespace { +/// \brief An easy to access representation of llvm.used and llvm.compiler_used. +class LLVMUsed { + SmallPtrSet<GlobalValue *, 8> Used; + SmallPtrSet<GlobalValue *, 8> CompilerUsed; + GlobalVariable *UsedV; + GlobalVariable *CompilerUsedV; + +public: + LLVMUsed(const Module &M) { + UsedV = collectUsedGlobalVariables(M, "llvm.used", Used); + CompilerUsedV = + collectUsedGlobalVariables(M, "llvm.compiler_used", CompilerUsed); + } + typedef SmallPtrSet<GlobalValue *, 8>::iterator iterator; + iterator usedBegin() { return Used.begin(); } + iterator usedEnd() { return Used.end(); } + iterator compilerUsedBegin() { return CompilerUsed.begin(); } + iterator compilerUsedEnd() { return CompilerUsed.end(); } + bool usedCount(GlobalValue *GV) const { return Used.count(GV); } + bool compilerUsedCount(GlobalValue *GV) const { + return CompilerUsed.count(GV); + } + bool usedErase(GlobalValue *GV) { return Used.erase(GV); } + bool compilerUsedErase(GlobalValue *GV) { return CompilerUsed.erase(GV); } + bool usedInsert(GlobalValue *GV) { return Used.insert(GV); } + bool compilerUsedInsert(GlobalValue *GV) { return CompilerUsed.insert(GV); } + + void syncVariablesAndSets() { + if (UsedV) + setUsedInitializer(*UsedV, Used); + if (CompilerUsedV) + setUsedInitializer(*CompilerUsedV, CompilerUsed); + } +}; +} + +static bool hasUseOtherThanLLVMUsed(GlobalAlias &GA, const LLVMUsed &U) { + if (GA.use_empty()) // No use at all. + return false; + + assert((!U.usedCount(&GA) || !U.compilerUsedCount(&GA)) && + "We should have removed the duplicated " + "element from llvm.compiler_used"); + if (!GA.hasOneUse()) + // Strictly more than one use. So at least one is not in llvm.used and + // llvm.compiler_used. + return true; + + // Exactly one use. Check if it is in llvm.used or llvm.compiler_used. 
+ return !U.usedCount(&GA) && !U.compilerUsedCount(&GA); +} + +static bool hasMoreThanOneUseOtherThanLLVMUsed(GlobalValue &V, + const LLVMUsed &U) { + unsigned N = 2; + assert((!U.usedCount(&V) || !U.compilerUsedCount(&V)) && + "We should have removed the duplicated " + "element from llvm.compiler_used"); + if (U.usedCount(&V) || U.compilerUsedCount(&V)) + ++N; + return V.hasNUsesOrMore(N); +} + +static bool mayHaveOtherReferences(GlobalAlias &GA, const LLVMUsed &U) { + if (!GA.hasLocalLinkage()) + return true; + + return U.usedCount(&GA) || U.compilerUsedCount(&GA); +} + +static bool hasUsesToReplace(GlobalAlias &GA, LLVMUsed &U, bool &RenameTarget) { + RenameTarget = false; + bool Ret = false; + if (hasUseOtherThanLLVMUsed(GA, U)) + Ret = true; + + // If the alias is externally visible, we may still be able to simplify it. + if (!mayHaveOtherReferences(GA, U)) + return Ret; + + // If the aliasee has internal linkage, give it the name and linkage + // of the alias, and delete the alias. This turns: + // define internal ... @f(...) + // @a = alias ... @f + // into: + // define ... @a(...) + Constant *Aliasee = GA.getAliasee(); + GlobalValue *Target = cast<GlobalValue>(Aliasee->stripPointerCasts()); + if (!Target->hasLocalLinkage()) + return Ret; + + // Do not perform the transform if multiple aliases potentially target the + // aliasee. This check also ensures that it is safe to replace the section + // and other attributes of the aliasee with those of the alias. + if (hasMoreThanOneUseOtherThanLLVMUsed(*Target, U)) + return Ret; + + RenameTarget = true; + return true; +} + bool GlobalOpt::OptimizeGlobalAliases(Module &M) { bool Changed = false; + LLVMUsed Used(M); + + for (SmallPtrSet<GlobalValue *, 8>::iterator I = Used.usedBegin(), + E = Used.usedEnd(); + I != E; ++I) + Used.compilerUsedErase(*I); for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); I != E;) { @@ -3057,37 +3217,29 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) { Constant *Aliasee = J->getAliasee(); GlobalValue *Target = cast<GlobalValue>(Aliasee->stripPointerCasts()); Target->removeDeadConstantUsers(); - bool hasOneUse = Target->hasOneUse() && Aliasee->hasOneUse(); // Make all users of the alias use the aliasee instead. - if (!J->use_empty()) { - J->replaceAllUsesWith(Aliasee); - ++NumAliasesResolved; - Changed = true; - } - - // If the alias is externally visible, we may still be able to simplify it. - if (!J->hasLocalLinkage()) { - // If the aliasee has internal linkage, give it the name and linkage - // of the alias, and delete the alias. This turns: - // define internal ... @f(...) - // @a = alias ... @f - // into: - // define ... @a(...) - if (!Target->hasLocalLinkage()) - continue; + bool RenameTarget; + if (!hasUsesToReplace(*J, Used, RenameTarget)) + continue; - // Do not perform the transform if multiple aliases potentially target the - // aliasee. This check also ensures that it is safe to replace the section - // and other attributes of the aliasee with those of the alias. - if (!hasOneUse) - continue; + J->replaceAllUsesWith(Aliasee); + ++NumAliasesResolved; + Changed = true; + if (RenameTarget) { // Give the aliasee the name, linkage and other attributes of the alias. Target->takeName(J); Target->setLinkage(J->getLinkage()); Target->GlobalValue::copyAttributesFrom(J); - } + + if (Used.usedErase(J)) + Used.usedInsert(Target); + + if (Used.compilerUsedErase(J)) + Used.compilerUsedInsert(Target); + } else if (mayHaveOtherReferences(*J, Used)) + continue; // Delete the alias. 
M.getAliasList().erase(J); @@ -3095,6 +3247,8 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) { Changed = true; } + Used.syncVariablesAndSets(); + return Changed; } @@ -3223,8 +3377,6 @@ bool GlobalOpt::runOnModule(Module &M) { // Try to find the llvm.globalctors list. GlobalVariable *GlobalCtors = FindGlobalCtors(M); - Function *CXAAtExitFn = FindCXAAtExit(M, TLI); - bool LocalChange = true; while (LocalChange) { LocalChange = false; @@ -3242,7 +3394,9 @@ bool GlobalOpt::runOnModule(Module &M) { // Resolve aliases, when possible. LocalChange |= OptimizeGlobalAliases(M); - // Try to remove trivial global destructors. + // Try to remove trivial global destructors if they are not removed + // already. + Function *CXAAtExitFn = FindCXAAtExit(M, TLI); if (CXAAtExitFn) LocalChange |= OptimizeEmptyGlobalCXXDtors(CXAAtExitFn); diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 986c0b8..8ed7704 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -32,6 +32,12 @@ static cl::opt<bool> RunLoopVectorization("vectorize-loops", cl::desc("Run the Loop vectorization passes")); +// This is a helper flag that we use for testing the profitability of +// vectorization on -O2 and -Os. It should go away once we make a decision. +static cl::opt<bool> +VectorizeO2("vectorize-o2", + cl::desc("Enable vectorization on all O levels")); + static cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::desc("Run the SLP vectorization passes")); @@ -192,7 +198,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. MPM.add(createLoopDeletionPass()); // Delete dead loops - if (LoopVectorize && OptLevel > 2) + if (LoopVectorize && (OptLevel > 2 || VectorizeO2)) MPM.add(createLoopVectorizePass()); if (!DisableUnrollLoops) diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp index 3396f79..754eff6 100644 --- a/lib/Transforms/IPO/StripSymbols.cpp +++ b/lib/Transforms/IPO/StripSymbols.cpp @@ -332,16 +332,6 @@ bool StripDebugDeclare::runOnModule(Module &M) { return true; } -/// getRealLinkageName - If special LLVM prefix that is used to inform the asm -/// printer to not emit usual symbol prefix before the symbol name is used then -/// return linkage name after skipping this special LLVM prefix. -static StringRef getRealLinkageName(StringRef LinkageName) { - char One = '\1'; - if (LinkageName.startswith(StringRef(&One, 1))) - return LinkageName.substr(1); - return LinkageName; -} - bool StripDeadDebugInfo::runOnModule(Module &M) { bool Changed = false; @@ -401,9 +391,8 @@ bool StripDeadDebugInfo::runOnModule(Module &M) { StringRef FName = DISubprogram(*I).getLinkageName(); if (FName.empty()) FName = DISubprogram(*I).getName(); - if (NamedMDNode *LVNMD = - M.getNamedMetadata(Twine("llvm.dbg.lv.", - getRealLinkageName(FName)))) + if (NamedMDNode *LVNMD = M.getNamedMetadata( + "llvm.dbg.lv." 
+ Function::getRealLinkageName(FName))) LVNMD->eraseFromParent(); } } diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h index 2a36074..b3084cc 100644 --- a/lib/Transforms/InstCombine/InstCombine.h +++ b/lib/Transforms/InstCombine/InstCombine.h @@ -1,4 +1,4 @@ -//===- InstCombine.h - Main InstCombine pass definition -------------------===// +//===- InstCombine.h - Main InstCombine pass definition ---------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -234,6 +234,7 @@ private: bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS); Value *EmitGEPOffset(User *GEP); Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN); + Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask); public: // InsertNewInstBefore - insert an instruction New before instruction Old diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index b96eb51..a2c545f 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -974,6 +974,11 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { return BinaryOperator::CreateSub(ConstantExpr::getAdd(XorRHS, CI), XorLHS); } + // (X + signbit) + C could have gotten canonicalized to (X ^ signbit) + C, + // transform them into (X + (signbit ^ C)) + if (XorRHS->getValue().isSignBit()) + return BinaryOperator::CreateAdd(XorLHS, + ConstantExpr::getXor(XorRHS, CI)); } } @@ -1232,6 +1237,74 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { } } + // select C, 0, B + select C, A, 0 -> select C, A, B + { + Value *A1, *B1, *C1, *A2, *B2, *C2; + if (match(LHS, m_Select(m_Value(C1), m_Value(A1), m_Value(B1))) && + match(RHS, m_Select(m_Value(C2), m_Value(A2), m_Value(B2)))) { + if (C1 == C2) { + Constant *Z1=0, *Z2=0; + Value *A, *B, *C=C1; + if (match(A1, m_AnyZero()) && match(B2, m_AnyZero())) { + Z1 = dyn_cast<Constant>(A1); A = A2; + Z2 = dyn_cast<Constant>(B2); B = B1; + } else if (match(B1, m_AnyZero()) && match(A2, m_AnyZero())) { + Z1 = dyn_cast<Constant>(B1); B = B2; + Z2 = dyn_cast<Constant>(A2); A = A1; + } + + if (Z1 && Z2 && + (I.hasNoSignedZeros() || + (Z1->isNegativeZeroValue() && Z2->isNegativeZeroValue()))) { + return SelectInst::Create(C, A, B); + } + } + } + } + + // A * (1 - uitofp i1 C) + B * (uitofp i1 C) -> select C, B, A + { + if (I.hasNoNaNs() && I.hasNoInfs() && I.hasNoSignedZeros()) { + Value *M1L, *M1R, *M2L, *M2R; + if (match(LHS, m_FMul(m_Value(M1L), m_Value(M1R))) && + match(RHS, m_FMul(m_Value(M2L), m_Value(M2R)))) { + + Value *A, *B, *C1, *C2; + if (!match(M1R, m_FSub(m_FPOne(), m_UIToFp(m_Value(C1))))) + std::swap(M1L, M1R); + if (!match(M2R, m_UIToFp(m_Value(C2)))) + std::swap(M2L, M2R); + + if (match(M1R, m_FSub(m_FPOne(), m_UIToFp(m_Value(C1)))) && + match(M2R, m_UIToFp(m_Value(C2))) && + C2->getType()->isIntegerTy(1) && + C1 == C2) { + A = M1L; + B = M2L; + return SelectInst::Create(C1, B, A); + } + + std::swap(M1L, M2L); + std::swap(M1R, M2R); + + if (!match(M1R, m_FSub(m_FPOne(), m_UIToFp(m_Value(C1))))) + std::swap(M1L, M1R); + if (!match(M2R, m_UIToFp(m_Value(C2)))) + std::swap(M2L, M2R); + + if (match(M1R, m_FSub(m_FPOne(), m_UIToFp(m_Value(C1)))) && + match(M2R, m_UIToFp(m_Value(C2))) && + C2->getType()->isIntegerTy(1) && + C1 == C2) { + A = M1L; + B = M2L; + return SelectInst::Create(C1, B, A); + } + } + } + } + + if (I.hasUnsafeAlgebra()) { if (Value *V = FAddCombine(Builder).simplify(&I)) return ReplaceInstUsesWith(I, V); diff --git 
a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index ec75dd2..496fce6 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -173,14 +173,14 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op, // Adding a one to a single bit bit-field should be turned into an XOR // of the bit. First thing to check is to see if this AND is with a // single bit constant. - const APInt &AndRHSV = cast<ConstantInt>(AndRHS)->getValue(); + const APInt &AndRHSV = AndRHS->getValue(); // If there is only one bit set. if (AndRHSV.isPowerOf2()) { // Ok, at this point, we know that we are masking the result of the // ADD down to exactly one bit. If the constant we are adding has // no bits set below this bit, then we can eliminate the ADD. - const APInt& AddRHS = cast<ConstantInt>(OpRHS)->getValue(); + const APInt& AddRHS = OpRHS->getValue(); // Check to see if any bits below the one bit set in AndRHSV are set. if ((AddRHS & (AndRHSV-1)) == 0) { @@ -209,8 +209,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op, uint32_t BitWidth = AndRHS->getType()->getBitWidth(); uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth); APInt ShlMask(APInt::getHighBitsSet(BitWidth, BitWidth-OpRHSVal)); - ConstantInt *CI = ConstantInt::get(AndRHS->getContext(), - AndRHS->getValue() & ShlMask); + ConstantInt *CI = Builder->getInt(AndRHS->getValue() & ShlMask); if (CI->getValue() == ShlMask) // Masking out bits that the shift already masks. @@ -230,8 +229,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op, uint32_t BitWidth = AndRHS->getType()->getBitWidth(); uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth); APInt ShrMask(APInt::getLowBitsSet(BitWidth, BitWidth - OpRHSVal)); - ConstantInt *CI = ConstantInt::get(Op->getContext(), - AndRHS->getValue() & ShrMask); + ConstantInt *CI = Builder->getInt(AndRHS->getValue() & ShrMask); if (CI->getValue() == ShrMask) // Masking out bits that the shift already masks. @@ -251,8 +249,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op, uint32_t BitWidth = AndRHS->getType()->getBitWidth(); uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth); APInt ShrMask(APInt::getLowBitsSet(BitWidth, BitWidth - OpRHSVal)); - Constant *C = ConstantInt::get(Op->getContext(), - AndRHS->getValue() & ShrMask); + Constant *C = Builder->getInt(AndRHS->getValue() & ShrMask); if (C == AndRHS) { // Masking out bits shifted in. // (Val ashr C1) & C2 -> (Val lshr C1) & C2 // Make the argument unsigned. @@ -279,7 +276,7 @@ Value *InstCombiner::InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, if (Inside) { if (Lo == Hi) // Trivially false. - return ConstantInt::getFalse(V->getContext()); + return Builder->getFalse(); // V >= Min && V < Hi --> V < Hi if (cast<ConstantInt>(Lo)->isMinValue(isSigned)) { @@ -296,7 +293,7 @@ Value *InstCombiner::InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, } if (Lo == Hi) // Trivially true. - return ConstantInt::getTrue(V->getContext()); + return Builder->getTrue(); // V < Min || V >= Hi -> V > Hi-1 Hi = SubOne(cast<ConstantInt>(Hi)); @@ -943,7 +940,7 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { // If either of the constants are nans, then the whole thing returns // false. 
if (LHSC->getValueAPF().isNaN() || RHSC->getValueAPF().isNaN()) - return ConstantInt::getFalse(LHS->getContext()); + return Builder->getFalse(); return Builder->CreateFCmpORD(LHS->getOperand(0), RHS->getOperand(0)); } @@ -1380,7 +1377,7 @@ static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask, // into a byteswap. At least one of the two bytes would not be aligned with // their ultimate destination. if (!isPowerOf2_32(ByteMask)) return true; - unsigned InputByteNo = CountTrailingZeros_32(ByteMask); + unsigned InputByteNo = countTrailingZeros(ByteMask); // 2) The input and ultimate destinations must line up: if byte 3 of an i32 // is demanded, it needs to go into byte 0 of the result. This means that the @@ -1588,7 +1585,7 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { case ICmpInst::ICMP_NE: // (X != 13 | X != 15) -> true case ICmpInst::ICMP_ULT: // (X != 13 | X u< 15) -> true case ICmpInst::ICMP_SLT: // (X != 13 | X s< 15) -> true - return ConstantInt::getTrue(LHS->getContext()); + return Builder->getTrue(); } case ICmpInst::ICMP_ULT: switch (RHSCC) { @@ -1640,7 +1637,7 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { break; case ICmpInst::ICMP_NE: // (X u> 13 | X != 15) -> true case ICmpInst::ICMP_ULT: // (X u> 13 | X u< 15) -> true - return ConstantInt::getTrue(LHS->getContext()); + return Builder->getTrue(); case ICmpInst::ICMP_SLT: // (X u> 13 | X s< 15) -> no change break; } @@ -1655,7 +1652,7 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { break; case ICmpInst::ICMP_NE: // (X s> 13 | X != 15) -> true case ICmpInst::ICMP_SLT: // (X s> 13 | X s< 15) -> true - return ConstantInt::getTrue(LHS->getContext()); + return Builder->getTrue(); case ICmpInst::ICMP_ULT: // (X s> 13 | X u< 15) -> no change break; } @@ -1676,7 +1673,7 @@ Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { // If either of the constants are nans, then the whole thing returns // true. if (LHSC->getValueAPF().isNaN() || RHSC->getValueAPF().isNaN()) - return ConstantInt::getTrue(LHS->getContext()); + return Builder->getTrue(); // Otherwise, no need to compare the two constants, compare the // rest. @@ -1779,8 +1776,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { Value *Or = Builder->CreateOr(X, RHS); Or->takeName(Op0); return BinaryOperator::CreateAnd(Or, - ConstantInt::get(I.getContext(), - RHS->getValue() | C1->getValue())); + Builder->getInt(RHS->getValue() | C1->getValue())); } // (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2) @@ -1789,8 +1785,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { Value *Or = Builder->CreateOr(X, RHS); Or->takeName(Op0); return BinaryOperator::CreateXor(Or, - ConstantInt::get(I.getContext(), - C1->getValue() & ~RHS->getValue())); + Builder->getInt(C1->getValue() & ~RHS->getValue())); } // Try to fold constant and into select arguments. @@ -1872,15 +1867,13 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { ((V1 == B && MaskedValueIsZero(V2, ~C1->getValue())) || // (V|N) (V2 == B && MaskedValueIsZero(V1, ~C1->getValue())))) // (N|V) return BinaryOperator::CreateAnd(A, - ConstantInt::get(A->getContext(), - C1->getValue()|C2->getValue())); + Builder->getInt(C1->getValue()|C2->getValue())); // Or commutes, try both ways. 
if (match(B, m_Or(m_Value(V1), m_Value(V2))) && ((V1 == A && MaskedValueIsZero(V2, ~C2->getValue())) || // (V|N) (V2 == A && MaskedValueIsZero(V1, ~C2->getValue())))) // (N|V) return BinaryOperator::CreateAnd(B, - ConstantInt::get(B->getContext(), - C1->getValue()|C2->getValue())); + Builder->getInt(C1->getValue()|C2->getValue())); // ((V|C3)&C1) | ((V|C4)&C2) --> (V|C3|C4)&(C1|C2) // iff (C1&C2) == 0 and (C3&~C1) == 0 and (C4&~C2) == 0. @@ -1891,8 +1884,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { (C4->getValue() & ~C2->getValue()) == 0) { V2 = Builder->CreateOr(V1, ConstantExpr::getOr(C3, C4), "bitfield"); return BinaryOperator::CreateAnd(V2, - ConstantInt::get(B->getContext(), - C1->getValue()|C2->getValue())); + Builder->getInt(C1->getValue()|C2->getValue())); } } } @@ -2160,8 +2152,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { if (CI->hasOneUse() && Op0C->hasOneUse()) { Instruction::CastOps Opcode = Op0C->getOpcode(); if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) && - (RHS == ConstantExpr::getCast(Opcode, - ConstantInt::getTrue(I.getContext()), + (RHS == ConstantExpr::getCast(Opcode, Builder->getTrue(), Op0C->getDestTy()))) { CI->setPredicate(CI->getInversePredicate()); return CastInst::Create(Opcode, CI, Op0C->getType()); @@ -2191,8 +2182,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { Op0I->getOperand(0)); } else if (RHS->getValue().isSignBit()) { // (X + C) ^ signbit -> (X + C + signbit) - Constant *C = ConstantInt::get(I.getContext(), - RHS->getValue() + Op0CI->getValue()); + Constant *C = Builder->getInt(RHS->getValue() + Op0CI->getValue()); return BinaryOperator::CreateAdd(Op0I->getOperand(0), C); } diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index 2ee1278..361acdd 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -677,7 +677,6 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) { case Instruction::Add: case Instruction::Sub: case Instruction::Mul: - case Instruction::Shl: if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear) || !CanEvaluateZExtd(I->getOperand(1), Ty, Tmp)) return false; @@ -701,6 +700,17 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) { // Otherwise, we don't know how to analyze this BitsToClear case yet. return false; + case Instruction::Shl: + // We can promote shl(x, cst) if we can promote x. Since shl overwrites the + // upper bits we can reduce BitsToClear by the shift amount. + if (ConstantInt *Amt = dyn_cast<ConstantInt>(I->getOperand(1))) { + if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear)) + return false; + uint64_t ShiftAmt = Amt->getZExtValue(); + BitsToClear = ShiftAmt < BitsToClear ? BitsToClear - ShiftAmt : 0; + return true; + } + return false; case Instruction::LShr: // We can promote lshr(x, cst) if we can promote x. This requires the // ultimate 'and' to clear out the high zero bits we're clearing out though. diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 4c252c0..af8a479 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -402,7 +402,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, if (SecondTrueElement != Overdefined) { // None true -> false. 
if (FirstTrueElement == Undefined) - return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(GEP->getContext())); + return ReplaceInstUsesWith(ICI, Builder->getFalse()); Value *FirstTrueIdx = ConstantInt::get(Idx->getType(), FirstTrueElement); @@ -422,7 +422,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, if (SecondFalseElement != Overdefined) { // None false -> true. if (FirstFalseElement == Undefined) - return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(GEP->getContext())); + return ReplaceInstUsesWith(ICI, Builder->getTrue()); Value *FirstFalseIdx = ConstantInt::get(Idx->getType(), FirstFalseElement); @@ -712,8 +712,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, if (NumDifferences == 0) // SAME GEP? return ReplaceInstUsesWith(I, // No comparison is needed here. - ConstantInt::get(Type::getInt1Ty(I.getContext()), - ICmpInst::isTrueWhenEqual(Cond))); + Builder->getInt1(ICmpInst::isTrueWhenEqual(Cond))); else if (NumDifferences == 1 && GEPsInBounds) { Value *LHSV = GEPLHS->getOperand(DiffOperand); @@ -752,11 +751,11 @@ Instruction *InstCombiner::FoldICmpAddOpCst(ICmpInst &ICI, // (X+4) == X -> false. if (Pred == ICmpInst::ICMP_EQ) - return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(X->getContext())); + return ReplaceInstUsesWith(ICI, Builder->getFalse()); // (X+4) != X -> true. if (Pred == ICmpInst::ICMP_NE) - return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(X->getContext())); + return ReplaceInstUsesWith(ICI, Builder->getTrue()); // From this point on, we know that (X+C <= X) --> (X+C < X) because C != 0, // so the values can never be equal. Similarly for all other "or equals" @@ -798,7 +797,7 @@ Instruction *InstCombiner::FoldICmpAddOpCst(ICmpInst &ICI, // (X+ -1) >s X --> X <s (MAXSINT-(-1-1)) --> X == -128 assert(Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE); - Constant *C = ConstantInt::get(X->getContext(), CI->getValue()-1); + Constant *C = Builder->getInt(CI->getValue()-1); return new ICmpInst(ICmpInst::ICMP_SLT, X, ConstantExpr::getSub(SMax, C)); } @@ -921,7 +920,7 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, default: llvm_unreachable("Unhandled icmp opcode!"); case ICmpInst::ICMP_EQ: if (LoOverflow && HiOverflow) - return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(ICI.getContext())); + return ReplaceInstUsesWith(ICI, Builder->getFalse()); if (HiOverflow) return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE, X, LoBound); @@ -932,7 +931,7 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, DivIsSigned, true)); case ICmpInst::ICMP_NE: if (LoOverflow && HiOverflow) - return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(ICI.getContext())); + return ReplaceInstUsesWith(ICI, Builder->getTrue()); if (HiOverflow) return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, X, LoBound); @@ -944,16 +943,16 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_SLT: if (LoOverflow == +1) // Low bound is greater than input range. - return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(ICI.getContext())); + return ReplaceInstUsesWith(ICI, Builder->getTrue()); if (LoOverflow == -1) // Low bound is less than input range. 
- return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(ICI.getContext())); + return ReplaceInstUsesWith(ICI, Builder->getFalse()); return new ICmpInst(Pred, X, LoBound); case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_SGT: if (HiOverflow == +1) // High bound greater than input range. - return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(ICI.getContext())); + return ReplaceInstUsesWith(ICI, Builder->getFalse()); if (HiOverflow == -1) // High bound less than input range. - return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(ICI.getContext())); + return ReplaceInstUsesWith(ICI, Builder->getTrue()); if (Pred == ICmpInst::ICMP_UGT) return new ICmpInst(ICmpInst::ICMP_UGE, X, HiBound); return new ICmpInst(ICmpInst::ICMP_SGE, X, HiBound); @@ -1017,7 +1016,7 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr, // If we are comparing against bits always shifted out, the // comparison cannot succeed. APInt Comp = CmpRHSV << ShAmtVal; - ConstantInt *ShiftedCmpRHS = ConstantInt::get(ICI.getContext(), Comp); + ConstantInt *ShiftedCmpRHS = Builder->getInt(Comp); if (Shr->getOpcode() == Instruction::LShr) Comp = Comp.lshr(ShAmtVal); else @@ -1025,8 +1024,7 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr, if (Comp != CmpRHSV) { // Comparing against a bit that we know is zero. bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE; - Constant *Cst = ConstantInt::get(Type::getInt1Ty(ICI.getContext()), - IsICMP_NE); + Constant *Cst = Builder->getInt1(IsICMP_NE); return ReplaceInstUsesWith(ICI, Cst); } @@ -1039,7 +1037,7 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr, if (Shr->hasOneUse()) { // Otherwise strength reduce the shift into an and. APInt Val(APInt::getHighBitsSet(TypeBits, TypeBits - ShAmtVal)); - Constant *Mask = ConstantInt::get(ICI.getContext(), Val); + Constant *Mask = Builder->getInt(Val); Value *And = Builder->CreateAnd(Shr->getOperand(0), Mask, Shr->getName()+".mask"); @@ -1072,7 +1070,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, APInt NewRHS = RHS->getValue().zext(SrcBits); NewRHS |= KnownOne & APInt::getHighBitsSet(SrcBits, SrcBits-DstBits); return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0), - ConstantInt::get(ICI.getContext(), NewRHS)); + Builder->getInt(NewRHS)); } } break; @@ -1115,8 +1113,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, ? ICI.getUnsignedPredicate() : ICI.getSignedPredicate(); return new ICmpInst(Pred, LHSI->getOperand(0), - ConstantInt::get(ICI.getContext(), - RHSV ^ SignBit)); + Builder->getInt(RHSV ^ SignBit)); } // (icmp u/s (xor A ~SignBit), C) -> (icmp s/u (xor C ~SignBit), A) @@ -1127,8 +1124,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, : ICI.getSignedPredicate(); Pred = ICI.getSwappedPredicate(Pred); return new ICmpInst(Pred, LHSI->getOperand(0), - ConstantInt::get(ICI.getContext(), - RHSV ^ NotSignBit)); + Builder->getInt(RHSV ^ NotSignBit)); } } } @@ -1218,11 +1214,9 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, // As a special case, check to see if this means that the // result is always true or false now. 
if (ICI.getPredicate() == ICmpInst::ICMP_EQ) - return ReplaceInstUsesWith(ICI, - ConstantInt::getFalse(ICI.getContext())); + return ReplaceInstUsesWith(ICI, Builder->getFalse()); if (ICI.getPredicate() == ICmpInst::ICMP_NE) - return ReplaceInstUsesWith(ICI, - ConstantInt::getTrue(ICI.getContext())); + return ReplaceInstUsesWith(ICI, Builder->getTrue()); } else { ICI.setOperand(1, NewCst); Constant *NewAndCST; @@ -1344,8 +1338,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, ShAmt); if (Comp != RHS) {// Comparing against a bit that we know is zero. bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE; - Constant *Cst = - ConstantInt::get(Type::getInt1Ty(ICI.getContext()), IsICMP_NE); + Constant *Cst = Builder->getInt1(IsICMP_NE); return ReplaceInstUsesWith(ICI, Cst); } @@ -1364,9 +1357,8 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, if (LHSI->hasOneUse()) { // Otherwise strength reduce the shift into an and. uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits); - Constant *Mask = - ConstantInt::get(ICI.getContext(), APInt::getLowBitsSet(TypeBits, - TypeBits-ShAmtVal)); + Constant *Mask = Builder->getInt(APInt::getLowBitsSet(TypeBits, + TypeBits - ShAmtVal)); Value *And = Builder->CreateAnd(LHSI->getOperand(0),Mask, LHSI->getName()+".mask"); @@ -1464,18 +1456,18 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, if (ICI.isSigned()) { if (CR.getLower().isSignBit()) { return new ICmpInst(ICmpInst::ICMP_SLT, LHSI->getOperand(0), - ConstantInt::get(ICI.getContext(),CR.getUpper())); + Builder->getInt(CR.getUpper())); } else if (CR.getUpper().isSignBit()) { return new ICmpInst(ICmpInst::ICMP_SGE, LHSI->getOperand(0), - ConstantInt::get(ICI.getContext(),CR.getLower())); + Builder->getInt(CR.getLower())); } } else { if (CR.getLower().isMinValue()) { return new ICmpInst(ICmpInst::ICMP_ULT, LHSI->getOperand(0), - ConstantInt::get(ICI.getContext(),CR.getUpper())); + Builder->getInt(CR.getUpper())); } else if (CR.getUpper().isMinValue()) { return new ICmpInst(ICmpInst::ICMP_UGE, LHSI->getOperand(0), - ConstantInt::get(ICI.getContext(),CR.getLower())); + Builder->getInt(CR.getLower())); } } } @@ -1555,9 +1547,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, if (ConstantInt *BOC = dyn_cast<ConstantInt>(BO->getOperand(1))) { Constant *NotCI = ConstantExpr::getNot(RHS); if (!ConstantExpr::getAnd(BOC, NotCI)->isNullValue()) - return ReplaceInstUsesWith(ICI, - ConstantInt::get(Type::getInt1Ty(ICI.getContext()), - isICMP_NE)); + return ReplaceInstUsesWith(ICI, Builder->getInt1(isICMP_NE)); } break; @@ -1566,9 +1556,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, // If bits are being compared against that are and'd out, then the // comparison can never succeed! if ((RHSV & ~BOC->getValue()) != 0) - return ReplaceInstUsesWith(ICI, - ConstantInt::get(Type::getInt1Ty(ICI.getContext()), - isICMP_NE)); + return ReplaceInstUsesWith(ICI, Builder->getInt1(isICMP_NE)); // If we have ((X & C) == C), turn it into ((X & C) != 0). 
if (RHS == BOC && RHSV.isPowerOf2()) @@ -1619,7 +1607,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, case Intrinsic::bswap: Worklist.Add(II); ICI.setOperand(0, II->getArgOperand(0)); - ICI.setOperand(1, ConstantInt::get(II->getContext(), RHSV.byteSwap())); + ICI.setOperand(1, Builder->getInt(RHSV.byteSwap())); return &ICI; case Intrinsic::ctlz: case Intrinsic::cttz: @@ -2041,19 +2029,19 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { case ICmpInst::ICMP_ULE: assert(!CI->isMaxValue(false)); // A <=u MAX -> TRUE return new ICmpInst(ICmpInst::ICMP_ULT, Op0, - ConstantInt::get(CI->getContext(), CI->getValue()+1)); + Builder->getInt(CI->getValue()+1)); case ICmpInst::ICMP_SLE: assert(!CI->isMaxValue(true)); // A <=s MAX -> TRUE return new ICmpInst(ICmpInst::ICMP_SLT, Op0, - ConstantInt::get(CI->getContext(), CI->getValue()+1)); + Builder->getInt(CI->getValue()+1)); case ICmpInst::ICMP_UGE: assert(!CI->isMinValue(false)); // A >=u MIN -> TRUE return new ICmpInst(ICmpInst::ICMP_UGT, Op0, - ConstantInt::get(CI->getContext(), CI->getValue()-1)); + Builder->getInt(CI->getValue()-1)); case ICmpInst::ICMP_SGE: assert(!CI->isMinValue(true)); // A >=s MIN -> TRUE return new ICmpInst(ICmpInst::ICMP_SGT, Op0, - ConstantInt::get(CI->getContext(), CI->getValue()-1)); + Builder->getInt(CI->getValue()-1)); } // If this comparison is a normal comparison, it demands all @@ -2192,7 +2180,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) { if (Op1Max == Op0Min+1) // A <u C -> A == C-1 if min(A)+1 == C return new ICmpInst(ICmpInst::ICMP_EQ, Op0, - ConstantInt::get(CI->getContext(), CI->getValue()-1)); + Builder->getInt(CI->getValue()-1)); // (x <u 2147483648) -> (x >s -1) -> true if sign bit clear if (CI->isMinValue(true)) @@ -2211,7 +2199,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) { if (Op1Min == Op0Max-1) // A >u C -> A == C+1 if max(a)-1 == C return new ICmpInst(ICmpInst::ICMP_EQ, Op0, - ConstantInt::get(CI->getContext(), CI->getValue()+1)); + Builder->getInt(CI->getValue()+1)); // (x >u 2147483647) -> (x <s 0) -> true if sign bit set if (CI->isMaxValue(true)) @@ -2229,7 +2217,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) { if (Op1Max == Op0Min+1) // A <s C -> A == C-1 if min(A)+1 == C return new ICmpInst(ICmpInst::ICMP_EQ, Op0, - ConstantInt::get(CI->getContext(), CI->getValue()-1)); + Builder->getInt(CI->getValue()-1)); } break; case ICmpInst::ICMP_SGT: @@ -2243,7 +2231,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) { if (Op1Min == Op0Max-1) // A >s C -> A == C+1 if max(A)-1 == C return new ICmpInst(ICmpInst::ICMP_EQ, Op0, - ConstantInt::get(CI->getContext(), CI->getValue()+1)); + Builder->getInt(CI->getValue()+1)); } break; case ICmpInst::ICMP_SGE: @@ -2719,8 +2707,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { ConstantInt *C1, *C2; if (match(B, m_ConstantInt(C1)) && match(D, m_ConstantInt(C2)) && Op1->hasOneUse()) { - Constant *NC = ConstantInt::get(I.getContext(), - C1->getValue() ^ C2->getValue()); + Constant *NC = Builder->getInt(C1->getValue() ^ C2->getValue()); Value *Xor = Builder->CreateXor(C, NC); return new ICmpInst(I.getPredicate(), A, Xor); } @@ -2885,9 +2872,9 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, Pred = ICmpInst::ICMP_NE; break; case FCmpInst::FCMP_ORD: - 
return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getTrue()); case FCmpInst::FCMP_UNO: - return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getFalse()); } IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType()); @@ -2907,8 +2894,8 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, if (SMax.compare(RHS) == APFloat::cmpLessThan) { // smax < 13123.0 if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE) - return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); - return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getTrue()); + return ReplaceInstUsesWith(I, Builder->getFalse()); } } else { // If the RHS value is > UnsignedMax, fold the comparison. This handles @@ -2919,8 +2906,8 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, if (UMax.compare(RHS) == APFloat::cmpLessThan) { // umax < 13123.0 if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE) - return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); - return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getTrue()); + return ReplaceInstUsesWith(I, Builder->getFalse()); } } @@ -2932,8 +2919,8 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // smin > 12312.0 if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE) - return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); - return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getTrue()); + return ReplaceInstUsesWith(I, Builder->getFalse()); } } else { // See if the RHS value is < UnsignedMin. 
@@ -2943,8 +2930,8 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // umin > 12312.0 if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE) - return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); - return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getTrue()); + return ReplaceInstUsesWith(I, Builder->getFalse()); } } @@ -2966,14 +2953,14 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, switch (Pred) { default: llvm_unreachable("Unexpected integer comparison!"); case ICmpInst::ICMP_NE: // (float)int != 4.4 --> true - return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getTrue()); case ICmpInst::ICMP_EQ: // (float)int == 4.4 --> false - return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getFalse()); case ICmpInst::ICMP_ULE: // (float)int <= 4.4 --> int <= 4 // (float)int <= -4.4 --> false if (RHS.isNegative()) - return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getFalse()); break; case ICmpInst::ICMP_SLE: // (float)int <= 4.4 --> int <= 4 @@ -2985,7 +2972,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, // (float)int < -4.4 --> false // (float)int < 4.4 --> int <= 4 if (RHS.isNegative()) - return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getFalse()); Pred = ICmpInst::ICMP_ULE; break; case ICmpInst::ICMP_SLT: @@ -2998,7 +2985,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, // (float)int > 4.4 --> int > 4 // (float)int > -4.4 --> true if (RHS.isNegative()) - return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getTrue()); break; case ICmpInst::ICMP_SGT: // (float)int > 4.4 --> int > 4 @@ -3010,7 +2997,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, // (float)int >= -4.4 --> true // (float)int >= 4.4 --> int > 4 if (RHS.isNegative()) - return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getTrue()); Pred = ICmpInst::ICMP_UGT; break; case ICmpInst::ICMP_SGE: diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index df73906..e36b762 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -95,6 +95,25 @@ static bool MultiplyOverflows(ConstantInt *C1, ConstantInt *C2, bool sign) { return MulExt.slt(Min) || MulExt.sgt(Max); } +/// \brief A helper routine of InstCombiner::visitMul(). +/// +/// If C is a vector of known powers of 2, then this function returns +/// a new vector obtained from C replacing each element with its logBase2. +/// Return a null pointer otherwise. 
+static Constant *getLogBase2Vector(ConstantDataVector *CV) { + const APInt *IVal; + SmallVector<Constant *, 4> Elts; + + for (unsigned I = 0, E = CV->getNumElements(); I != E; ++I) { + Constant *Elt = CV->getElementAsConstant(I); + if (!match(Elt, m_APInt(IVal)) || !IVal->isPowerOf2()) + return 0; + Elts.push_back(ConstantInt::get(Elt->getType(), IVal->logBase2())); + } + + return ConstantVector::get(Elts); +} + Instruction *InstCombiner::visitMul(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); @@ -108,24 +127,37 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { if (match(Op1, m_AllOnes())) // X * -1 == 0 - X return BinaryOperator::CreateNeg(Op0, I.getName()); - if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) { - - // ((X << C1)*C2) == (X * (C2 << C1)) - if (BinaryOperator *SI = dyn_cast<BinaryOperator>(Op0)) - if (SI->getOpcode() == Instruction::Shl) - if (Constant *ShOp = dyn_cast<Constant>(SI->getOperand(1))) - return BinaryOperator::CreateMul(SI->getOperand(0), - ConstantExpr::getShl(CI, ShOp)); - - const APInt &Val = CI->getValue(); - if (Val.isPowerOf2()) { // Replace X*(2^C) with X << C - Constant *NewCst = ConstantInt::get(Op0->getType(), Val.logBase2()); - BinaryOperator *Shl = BinaryOperator::CreateShl(Op0, NewCst); - if (I.hasNoSignedWrap()) Shl->setHasNoSignedWrap(); - if (I.hasNoUnsignedWrap()) Shl->setHasNoUnsignedWrap(); - return Shl; + // Also allow combining multiply instructions on vectors. + { + Value *NewOp; + Constant *C1, *C2; + const APInt *IVal; + if (match(&I, m_Mul(m_Shl(m_Value(NewOp), m_Constant(C2)), + m_Constant(C1))) && + match(C1, m_APInt(IVal))) + // ((X << C1)*C2) == (X * (C2 << C1)) + return BinaryOperator::CreateMul(NewOp, ConstantExpr::getShl(C1, C2)); + + if (match(&I, m_Mul(m_Value(NewOp), m_Constant(C1)))) { + Constant *NewCst = 0; + if (match(C1, m_APInt(IVal)) && IVal->isPowerOf2()) + // Replace X*(2^C) with X << C, where C is either a scalar or a splat. + NewCst = ConstantInt::get(NewOp->getType(), IVal->logBase2()); + else if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(C1)) + // Replace X*(2^C) with X << C, where C is a vector of known + // constant powers of 2. + NewCst = getLogBase2Vector(CV); + + if (NewCst) { + BinaryOperator *Shl = BinaryOperator::CreateShl(NewOp, NewCst); + if (I.hasNoSignedWrap()) Shl->setHasNoSignedWrap(); + if (I.hasNoUnsignedWrap()) Shl->setHasNoUnsignedWrap(); + return Shl; + } } + } + if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) { // Canonicalize (X+C1)*CI -> X*CI+C1*CI. { Value *X; ConstantInt *C1; if (Op0->hasOneUse() && @@ -584,8 +616,7 @@ bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) { *I = SI->getOperand(NonNullOperand); Worklist.Add(BBI); } else if (*I == SelectCond) { - *I = NonNullOperand == 1 ? ConstantInt::getTrue(BBI->getContext()) : - ConstantInt::getFalse(BBI->getContext()); + *I = Builder->getInt1(NonNullOperand == 1); Worklist.Add(BBI); } } @@ -817,7 +848,7 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) { /// FP value and: /// 1) 1/C is exact, or /// 2) reciprocal is allowed. -/// If the convertion was successful, the simplified expression "X * 1/C" is +/// If the conversion was successful, the simplified expression "X * 1/C" is /// returned; otherwise, NULL is returned. 
/// static Instruction *CvtFDivConstToReciprocal(Value *Dividend, @@ -998,37 +1029,19 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) { if (Instruction *common = commonIRemTransforms(I)) return common; - // X urem C^2 -> X and C-1 - { const APInt *C; - if (match(Op1, m_Power2(C))) - return BinaryOperator::CreateAnd(Op0, - ConstantInt::get(I.getType(), *C-1)); - } - - // Turn A % (C << N), where C is 2^k, into A & ((C << N)-1) - if (match(Op1, m_Shl(m_Power2(), m_Value()))) { - Constant *N1 = Constant::getAllOnesValue(I.getType()); - Value *Add = Builder->CreateAdd(Op1, N1); - return BinaryOperator::CreateAnd(Op0, Add); - } - - // urem X, (select Cond, 2^C1, 2^C2) --> - // select Cond, (and X, C1-1), (and X, C2-1) - // when C1&C2 are powers of two. - { Value *Cond; const APInt *C1, *C2; - if (match(Op1, m_Select(m_Value(Cond), m_Power2(C1), m_Power2(C2)))) { - Value *TrueAnd = Builder->CreateAnd(Op0, *C1-1, Op1->getName()+".t"); - Value *FalseAnd = Builder->CreateAnd(Op0, *C2-1, Op1->getName()+".f"); - return SelectInst::Create(Cond, TrueAnd, FalseAnd); - } - } - // (zext A) urem (zext B) --> zext (A urem B) if (ZExtInst *ZOp0 = dyn_cast<ZExtInst>(Op0)) if (Value *ZOp1 = dyn_castZExtVal(Op1, ZOp0->getSrcTy())) return new ZExtInst(Builder->CreateURem(ZOp0->getOperand(0), ZOp1), I.getType()); + // X urem Y -> X and Y-1, where Y is a power of 2, + if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/true)) { + Constant *N1 = Constant::getAllOnesValue(I.getType()); + Value *Add = Builder->CreateAdd(Op1, N1); + return BinaryOperator::CreateAnd(Op0, Add); + } + return 0; } diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index 2defe63..59502fb 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -974,7 +974,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { return &SI; } - if (VectorType *VecTy = dyn_cast<VectorType>(SI.getType())) { + if (VectorType* VecTy = dyn_cast<VectorType>(SI.getType())) { unsigned VWidth = VecTy->getNumElements(); APInt UndefElts(VWidth, 0); APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth)); @@ -984,24 +984,6 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { return &SI; } - if (ConstantVector *CV = dyn_cast<ConstantVector>(CondVal)) { - // Form a shufflevector instruction. - SmallVector<Constant *, 8> Mask(VWidth); - Type *Int32Ty = Type::getInt32Ty(CV->getContext()); - for (unsigned i = 0; i != VWidth; ++i) { - Constant *Elem = cast<Constant>(CV->getOperand(i)); - if (ConstantInt *E = dyn_cast<ConstantInt>(Elem)) - Mask[i] = ConstantInt::get(Int32Ty, i + (E->isZero() ? 
VWidth : 0)); - else if (isa<UndefValue>(Elem)) - Mask[i] = UndefValue::get(Int32Ty); - else - return 0; - } - Constant *MaskVal = ConstantVector::get(Mask); - Value *V = Builder->CreateShuffleVector(TrueVal, FalseVal, MaskVal); - return ReplaceInstUsesWith(SI, V); - } - if (isa<ConstantAggregateZero>(CondVal)) { return ReplaceInstUsesWith(SI, FalseVal); } diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 8add1ea..a7bfe09 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -754,7 +754,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); // If it's known zero, our sign bit is also zero. if (LHSKnownZero.isNegative()) - KnownZero |= LHSKnownZero; + KnownZero.setBit(KnownZero.getBitWidth() - 1); } break; case Instruction::URem: { diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index de8a3ac..d43093d 100644 --- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -125,17 +125,15 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { // and that it is a binary operation which is cheap to scalarize. // otherwise return NULL. if (!PHIUser->hasOneUse() || !(PHIUser->use_back() == PN) || - !(isa<BinaryOperator>(PHIUser)) || - !CheapToScalarize(PHIUser, true)) + !(isa<BinaryOperator>(PHIUser)) || !CheapToScalarize(PHIUser, true)) return NULL; // Create a scalar PHI node that will replace the vector PHI node // just before the current PHI node. - PHINode * scalarPHI = cast<PHINode>( - InsertNewInstWith(PHINode::Create(EI.getType(), - PN->getNumIncomingValues(), ""), *PN)); + PHINode *scalarPHI = cast<PHINode>(InsertNewInstWith( + PHINode::Create(EI.getType(), PN->getNumIncomingValues(), ""), *PN)); // Scalarize each PHI operand. - for (unsigned i=0; i < PN->getNumIncomingValues(); i++) { + for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) { Value *PHIInVal = PN->getIncomingValue(i); BasicBlock *inBB = PN->getIncomingBlock(i); Value *Elt = EI.getIndexOperand(); @@ -145,17 +143,17 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { // scalar PHI and the second operand is extracted from the other // vector operand. BinaryOperator *B0 = cast<BinaryOperator>(PHIUser); - unsigned opId = (B0->getOperand(0) == PN) ? 1: 0; - Value *Op = Builder->CreateExtractElement( - B0->getOperand(opId), Elt, B0->getOperand(opId)->getName()+".Elt"); + unsigned opId = (B0->getOperand(0) == PN) ? 1 : 0; + Value *Op = InsertNewInstWith( + ExtractElementInst::Create(B0->getOperand(opId), Elt, + B0->getOperand(opId)->getName() + ".Elt"), + *B0); Value *newPHIUser = InsertNewInstWith( - BinaryOperator::Create(B0->getOpcode(), scalarPHI,Op), - *B0); + BinaryOperator::Create(B0->getOpcode(), scalarPHI, Op), *B0); scalarPHI->addIncoming(newPHIUser, inBB); } else { // Scalarize PHI input: - Instruction *newEI = - ExtractElementInst::Create(PHIInVal, Elt, ""); + Instruction *newEI = ExtractElementInst::Create(PHIInVal, Elt, ""); // Insert the new instruction into the predecessor basic block. 
Instruction *pos = dyn_cast<Instruction>(PHIInVal); BasicBlock::iterator InsertPos; @@ -222,9 +220,9 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { // If there's a vector PHI feeding a scalar use through this extractelement // instruction, try to scalarize the PHI. if (PHINode *PN = dyn_cast<PHINode>(EI.getOperand(0))) { - Instruction *scalarPHI = scalarizePHI(EI, PN); - if (scalarPHI) - return (scalarPHI); + Instruction *scalarPHI = scalarizePHI(EI, PN); + if (scalarPHI) + return scalarPHI; } } @@ -496,6 +494,252 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) { return 0; } +/// Return true if we can evaluate the specified expression tree if the vector +/// elements were shuffled in a different order. +static bool CanEvaluateShuffled(Value *V, ArrayRef<int> Mask, + unsigned Depth = 5) { + // We can always reorder the elements of a constant. + if (isa<Constant>(V)) + return true; + + // We won't reorder vector arguments. No IPO here. + Instruction *I = dyn_cast<Instruction>(V); + if (!I) return false; + + // Two users may expect different orders of the elements. Don't try it. + if (!I->hasOneUse()) + return false; + + if (Depth == 0) return false; + + switch (I->getOpcode()) { + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::ICmp: + case Instruction::FCmp: + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::GetElementPtr: { + for (int i = 0, e = I->getNumOperands(); i != e; ++i) { + if (!CanEvaluateShuffled(I->getOperand(i), Mask, Depth-1)) + return false; + } + return true; + } + case Instruction::InsertElement: { + ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(2)); + if (!CI) return false; + int ElementNumber = CI->getLimitedValue(); + + // Verify that 'CI' does not occur twice in Mask. A single 'insertelement' + // can't put an element into multiple indices. + bool SeenOnce = false; + for (int i = 0, e = Mask.size(); i != e; ++i) { + if (Mask[i] == ElementNumber) { + if (SeenOnce) + return false; + SeenOnce = true; + } + } + return CanEvaluateShuffled(I->getOperand(0), Mask, Depth-1); + } + } + return false; +} + +/// Rebuild a new instruction just like 'I' but with the new operands given. +/// In the event of type mismatch, the type of the operands is correct. +static Value *BuildNew(Instruction *I, ArrayRef<Value*> NewOps) { + // We don't want to use the IRBuilder here because we want the replacement + // instructions to appear next to 'I', not the builder's insertion point. 
+ switch (I->getOpcode()) { + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + BinaryOperator *BO = cast<BinaryOperator>(I); + assert(NewOps.size() == 2 && "binary operator with #ops != 2"); + BinaryOperator *New = + BinaryOperator::Create(cast<BinaryOperator>(I)->getOpcode(), + NewOps[0], NewOps[1], "", BO); + if (isa<OverflowingBinaryOperator>(BO)) { + New->setHasNoUnsignedWrap(BO->hasNoUnsignedWrap()); + New->setHasNoSignedWrap(BO->hasNoSignedWrap()); + } + if (isa<PossiblyExactOperator>(BO)) { + New->setIsExact(BO->isExact()); + } + return New; + } + case Instruction::ICmp: + assert(NewOps.size() == 2 && "icmp with #ops != 2"); + return new ICmpInst(I, cast<ICmpInst>(I)->getPredicate(), + NewOps[0], NewOps[1]); + case Instruction::FCmp: + assert(NewOps.size() == 2 && "fcmp with #ops != 2"); + return new FCmpInst(I, cast<FCmpInst>(I)->getPredicate(), + NewOps[0], NewOps[1]); + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPTrunc: + case Instruction::FPExt: { + // It's possible that the mask has a different number of elements from + // the original cast. We recompute the destination type to match the mask. + Type *DestTy = + VectorType::get(I->getType()->getScalarType(), + NewOps[0]->getType()->getVectorNumElements()); + assert(NewOps.size() == 1 && "cast with #ops != 1"); + return CastInst::Create(cast<CastInst>(I)->getOpcode(), NewOps[0], DestTy, + "", I); + } + case Instruction::GetElementPtr: { + Value *Ptr = NewOps[0]; + ArrayRef<Value*> Idx = NewOps.slice(1); + GetElementPtrInst *GEP = GetElementPtrInst::Create(Ptr, Idx, "", I); + GEP->setIsInBounds(cast<GetElementPtrInst>(I)->isInBounds()); + return GEP; + } + } + llvm_unreachable("failed to rebuild vector instructions"); +} + +Value * +InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) { + // Mask.size() does not need to be equal to the number of vector elements. 
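The comment closing the hunk above is the key subtlety of EvaluateInDifferentElementOrder: the shuffle mask, not the source vector, determines the result width, and a mask entry of -1 yields an undef lane. A small standalone sketch of those mask semantics, with std::optional modeling undef; this is illustrative only, not the LLVM implementation.

    #include <cassert>
    #include <optional>
    #include <vector>

    // Apply a shufflevector-style mask to a single source vector:
    // result[i] = src[mask[i]], with -1 producing an undefined lane.
    std::vector<std::optional<int>>
    applyMask(const std::vector<int> &Src, const std::vector<int> &Mask) {
      std::vector<std::optional<int>> Out;
      for (int M : Mask)
        Out.push_back(M < 0 ? std::nullopt
                            : std::optional<int>(Src[(size_t)M]));
      return Out;
    }

    int main() {
      std::vector<int> Src = {10, 20, 30, 40};
      // Mask is shorter than the source and reorders lanes; one lane is undef.
      auto Out = applyMask(Src, {2, -1, 0});
      assert(Out.size() == 3 && *Out[0] == 30 && !Out[1] && *Out[2] == 10);
      return 0;
    }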
+ + assert(V->getType()->isVectorTy() && "can't reorder non-vector elements"); + if (isa<UndefValue>(V)) { + return UndefValue::get(VectorType::get(V->getType()->getScalarType(), + Mask.size())); + } + if (isa<ConstantAggregateZero>(V)) { + return ConstantAggregateZero::get( + VectorType::get(V->getType()->getScalarType(), + Mask.size())); + } + if (Constant *C = dyn_cast<Constant>(V)) { + SmallVector<Constant *, 16> MaskValues; + for (int i = 0, e = Mask.size(); i != e; ++i) { + if (Mask[i] == -1) + MaskValues.push_back(UndefValue::get(Builder->getInt32Ty())); + else + MaskValues.push_back(Builder->getInt32(Mask[i])); + } + return ConstantExpr::getShuffleVector(C, UndefValue::get(C->getType()), + ConstantVector::get(MaskValues)); + } + + Instruction *I = cast<Instruction>(V); + switch (I->getOpcode()) { + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::ICmp: + case Instruction::FCmp: + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::Select: + case Instruction::GetElementPtr: { + SmallVector<Value*, 8> NewOps; + bool NeedsRebuild = (Mask.size() != I->getType()->getVectorNumElements()); + for (int i = 0, e = I->getNumOperands(); i != e; ++i) { + Value *V = EvaluateInDifferentElementOrder(I->getOperand(i), Mask); + NewOps.push_back(V); + NeedsRebuild |= (V != I->getOperand(i)); + } + if (NeedsRebuild) { + return BuildNew(I, NewOps); + } + return I; + } + case Instruction::InsertElement: { + int Element = cast<ConstantInt>(I->getOperand(2))->getLimitedValue(); + + // The insertelement was inserting at Element. Figure out which element + // that becomes after shuffling. The answer is guaranteed to be unique + // by CanEvaluateShuffled. + bool Found = false; + int Index = 0; + for (int e = Mask.size(); Index != e; ++Index) { + if (Mask[Index] == Element) { + Found = true; + break; + } + } + + if (!Found) + return UndefValue::get(I->getType()); + Value *V = EvaluateInDifferentElementOrder(I->getOperand(0), Mask); + return InsertElementInst::Create(V, I->getOperand(1), + Builder->getInt32(Index), "", I); + } + } + llvm_unreachable("failed to reorder elements of vector instruction!"); +} Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { Value *LHS = SVI.getOperand(0); @@ -527,9 +771,9 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { if (LHS == RHS || isa<UndefValue>(LHS)) { if (isa<UndefValue>(LHS) && LHS == RHS) { // shuffle(undef,undef,mask) -> undef. - Value* result = (VWidth == LHSWidth) + Value *Result = (VWidth == LHSWidth) ? LHS : UndefValue::get(SVI.getType()); - return ReplaceInstUsesWith(SVI, result); + return ReplaceInstUsesWith(SVI, Result); } // Remap any references to RHS to use LHS. 
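The insertelement case shown above must find where the inserted lane lands after shuffling; CanEvaluateShuffled has already guaranteed the element number occurs at most once in the mask, so a linear scan suffices and the answer is unique. A standalone sketch of that remapping; remapInsertIndex is a hypothetical name, not an LLVM helper.

    #include <cassert>
    #include <vector>

    // Return the post-shuffle position of the lane 'Element' was inserted
    // into, or -1 if the mask drops that lane entirely (the insert becomes
    // dead and the corresponding result lane is undef).
    int remapInsertIndex(const std::vector<int> &Mask, int Element) {
      for (size_t I = 0; I != Mask.size(); ++I)
        if (Mask[I] == Element)
          return (int)I;   // unique by construction
      return -1;
    }

    int main() {
      // insertelement wrote lane 2; after shuffling with mask {2,0,3} that
      // value now lives in result lane 0.
      assert(remapInsertIndex({2, 0, 3}, 2) == 0);
      // Mask {0,3} never reads lane 2: the inserted value is dropped.
      assert(remapInsertIndex({0, 3}, 2) == -1);
      return 0;
    }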
@@ -576,6 +820,11 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { if (isRHSID) return ReplaceInstUsesWith(SVI, RHS); } + if (isa<UndefValue>(RHS) && CanEvaluateShuffled(LHS, Mask)) { + Value *V = EvaluateInDifferentElementOrder(LHS, Mask); + return ReplaceInstUsesWith(SVI, V); + } + // If the LHS is a shufflevector itself, see if we can combine it with this // one without producing an unusual shuffle. // Cases that might be simplified: diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index c6115e3..ec10751 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1483,7 +1483,7 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) { Module *M = II->getParent()->getParent()->getParent(); Function *F = Intrinsic::getDeclaration(M, Intrinsic::donothing); InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(), - ArrayRef<Value *>(), "", II->getParent()); + None, "", II->getParent()); } return EraseInstFromFunction(MI); } diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 623c470..22851b4 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -39,9 +39,9 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/system_error.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/BlackList.h" #include "llvm/Transforms/Utils/Local.h" @@ -56,6 +56,7 @@ static const uint64_t kDefaultShadowOffset32 = 1ULL << 29; static const uint64_t kDefaultShadowOffset64 = 1ULL << 44; static const uint64_t kDefaultShort64bitShadowOffset = 0x7FFF8000; // < 2G. static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 41; +static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa8000; static const size_t kMaxStackMallocSize = 1 << 16; // 64K static const uintptr_t kCurrentStackFrameMagic = 0x41B58AB3; @@ -208,6 +209,8 @@ static ShadowMapping getShadowMapping(const Module &M, int LongSize, bool IsMacOSX = TargetTriple.getOS() == llvm::Triple::MacOSX; bool IsPPC64 = TargetTriple.getArch() == llvm::Triple::ppc64; bool IsX86_64 = TargetTriple.getArch() == llvm::Triple::x86_64; + bool IsMIPS32 = TargetTriple.getArch() == llvm::Triple::mips || + TargetTriple.getArch() == llvm::Triple::mipsel; ShadowMapping Mapping; @@ -217,7 +220,8 @@ static ShadowMapping getShadowMapping(const Module &M, int LongSize, Mapping.OrShadowOffset = !IsPPC64 && !ClShort64BitOffset; Mapping.Offset = (IsAndroid || ZeroBaseShadow) ? 0 : - (LongSize == 32 ? kDefaultShadowOffset32 : + (LongSize == 32 ? + (IsMIPS32 ? kMIPS32_ShadowOffset32 : kDefaultShadowOffset32) : IsPPC64 ? 
kPPC64_ShadowOffset64 : kDefaultShadowOffset64); if (!ZeroBaseShadow && ClShort64BitOffset && IsX86_64 && !IsMacOSX) { assert(LongSize == 64); @@ -520,7 +524,7 @@ ModulePass *llvm::createAddressSanitizerModulePass( } static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { - size_t Res = CountTrailingZeros_32(TypeSize / 8); + size_t Res = countTrailingZeros(TypeSize / 8); assert(Res < kNumberOfAccessSizes); return Res; } @@ -1270,6 +1274,10 @@ void FunctionStackPoisoner::poisonRedZones( RedzoneSize(), 1ULL << Mapping.Scale, kAsanStackPartialRedzoneMagic); + Poison = + ASan.TD->isLittleEndian() + ? support::endian::byte_swap<uint32_t, support::little>(Poison) + : support::endian::byte_swap<uint32_t, support::big>(Poison); } Value *PartialPoison = ConstantInt::get(RZTy, Poison); IRB.CreateStore(PartialPoison, IRB.CreateIntToPtr(Ptr, RZPtrTy)); diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt index 1c9e053..aa265a4 100644 --- a/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/lib/Transforms/Instrumentation/CMakeLists.txt @@ -2,6 +2,7 @@ add_llvm_library(LLVMInstrumentation AddressSanitizer.cpp BlackList.cpp BoundsChecking.cpp + DebugIR.cpp EdgeProfiling.cpp GCOVProfiling.cpp MemorySanitizer.cpp diff --git a/lib/Transforms/Instrumentation/DebugIR.cpp b/lib/Transforms/Instrumentation/DebugIR.cpp new file mode 100644 index 0000000..020804f --- /dev/null +++ b/lib/Transforms/Instrumentation/DebugIR.cpp @@ -0,0 +1,310 @@ +//===--- DebugIR.cpp - Transform debug metadata to allow debugging IR -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// A Module transform pass that emits a succinct version of the IR and replaces +// the source file metadata to allow debuggers to step through the IR. +// +// The location where the IR file is emitted is the same as the directory +// operand of the !llvm.dbg.cu metadata node present in the input module. The +// file name is constructed from the original file name by stripping the +// extension and replacing it with "-debug-ll" or the Postfix string specified +// at construction. +// +// FIXME: instead of replacing debug metadata, additional metadata should be +// used to point capable debuggers to the IR file without destroying the +// mapping to the original source file. +// +// FIXME: this pass should not depend on the existence of debug metadata in +// the module as it does now. Instead, it should use DIBuilder to create the +// required metadata. +// +//===----------------------------------------------------------------------===// + +#include <string> + +#include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/ValueMap.h" +#include "llvm/Assembly/AssemblyAnnotationWriter.h" +#include "llvm/DebugInfo.h" +#include "llvm/DIBuilder.h" +#include "llvm/InstVisitor.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ToolOutputFile.h" +#include "llvm/Support/FormattedStream.h" +using namespace llvm; + +namespace { + +/// Builds a map of Value* to line numbers on which the Value appears in a +/// textual representation of the IR by plugging into the AssemblyWriter by +/// masquerading as an AssemblyAnnotationWriter.
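The ValueToLineMap trick described above is worth restating outside LLVM: print everything once while counting output lines, and record the line on which each item is emitted. A toy standalone model follows, in which a plain loop stands in for the AssemblyWriter callback.

    #include <cassert>
    #include <map>
    #include <string>
    #include <vector>

    // Model of the ValueToLineMap idea: emit every item once, counting
    // lines as we go, and remember the line each item landed on. DebugIR
    // does this by printing the Module to a null stream and hooking
    // printInfoComment.
    int main() {
      std::vector<std::string> Items = {"define i32 @f()", "  ret i32 0", "}"};
      std::map<std::string, unsigned> LineOf;
      unsigned Line = 1;
      for (const std::string &I : Items)
        LineOf[I] = Line++; // record, then advance past the printed line
      assert(LineOf["  ret i32 0"] == 2);
      return 0;
    }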
+class ValueToLineMap : public AssemblyAnnotationWriter { + ValueMap<const Value *, unsigned int> Lines; + typedef ValueMap<const Value *, unsigned int>::const_iterator LineIter; + +public: + + /// Prints Module to a null buffer in order to build the map of Value pointers + /// to line numbers. + ValueToLineMap(Module *M) { + raw_null_ostream ThrowAway; + M->print(ThrowAway, this); + } + + // This function is called after an Instruction, GlobalValue, or GlobalAlias + // is printed. + void printInfoComment(const Value &V, formatted_raw_ostream &Out) { + Out.flush(); + Lines.insert(std::make_pair(&V, Out.getLine() + 1)); + } + + /// If V appears on a line in the textual IR representation, sets Line to the + /// line number and returns true, otherwise returns false. + bool getLine(const Value *V, unsigned int &Line) const { + LineIter i = Lines.find(V); + if (i != Lines.end()) { + Line = i->second; + return true; + } + return false; + } +}; + +/// Removes debug intrinsics like llvm.dbg.declare and llvm.dbg.value. +class DebugIntrinsicsRemover : public InstVisitor<DebugIntrinsicsRemover> { + void remove(Instruction &I) { I.eraseFromParent(); } + +public: + void visitDbgDeclareInst(DbgDeclareInst &I) { remove(I); } + void visitDbgValueInst(DbgValueInst &I) { remove(I); } + void visitDbgInfoIntrinsic(DbgInfoIntrinsic &I) { remove(I); } +}; + +/// Removes debug metadata (!dbg) nodes from all instructions as well as +/// metadata named "llvm.dbg.cu" in the Module. +class DebugMetadataRemover : public InstVisitor<DebugMetadataRemover> { +public: + void visitInstruction(Instruction &I) { + if (I.getMetadata(LLVMContext::MD_dbg)) + I.setMetadata(LLVMContext::MD_dbg, 0); + } + + void run(Module *M) { + // Remove debug metadata attached to instructions + visit(M); + + // Remove CU named metadata (and all children nodes) + NamedMDNode *Node = M->getNamedMetadata("llvm.dbg.cu"); + M->eraseNamedMetadata(Node); + } +}; + +/// Replaces line number metadata attached to Instruction nodes with new line +/// numbers provided by the ValueToLineMap. +class LineNumberReplacer : public InstVisitor<LineNumberReplacer> { + /// Table of line numbers + const ValueToLineMap &LineTable; + + /// Table of cloned values + const ValueToValueMapTy &VMap; + + /// Directory of debug metadata + const DebugInfoFinder &Finder; + +public: + LineNumberReplacer(const ValueToLineMap &VLM, const DebugInfoFinder &Finder, + const ValueToValueMapTy &VMap) + : LineTable(VLM), VMap(VMap), Finder(Finder) {} + + void visitInstruction(Instruction &I) { + DebugLoc Loc(I.getDebugLoc()); + + unsigned Col = 0; // FIXME: support columns + unsigned Line; + if (!LineTable.getLine(VMap.lookup(&I), Line)) + // Instruction has no line, it may have been removed (in the module that + // will be passed to the debugger) so there is nothing to do here. + return; + + DebugLoc NewLoc; + if (!Loc.isUnknown()) + // I had a previous debug location: re-use the DebugLoc + NewLoc = DebugLoc::get(Line, Col, Loc.getScope(I.getContext()), + Loc.getInlinedAt(I.getContext())); + else if (MDNode *scope = findFunctionMD(I.getParent()->getParent())) + // I had no previous debug location, but M has some debug information + NewLoc = + DebugLoc::get(Line, Col, scope, /*FIXME: inlined instructions*/ 0); + else + // Neither I nor M has any debug information -- nothing to do here.
+ // FIXME: support debugging of undecorated IR (generated by clang without + // the -g option) + return; + + addDebugLocation(const_cast<Instruction &>(I), NewLoc); + } + +private: + + /// Returns the MDNode that corresponds with F + MDNode *findFunctionMD(const Function *F) { + for (DebugInfoFinder::iterator i = Finder.subprogram_begin(), + e = Finder.subprogram_end(); + i != e; ++i) { + DISubprogram S(*i); + if (S.getFunction() == F) + return *i; + } + // cannot find F -- likely means there is no debug information + return 0; + } + + void addDebugLocation(Instruction &I, DebugLoc Loc) { + MDNode *MD = Loc.getAsMDNode(I.getContext()); + I.setMetadata(LLVMContext::MD_dbg, MD); + } +}; + +class DebugIR : public ModulePass { + std::string Postfix; + std::string Filename; + + /// Flags to control the verbosity of the generated IR file + bool hideDebugIntrinsics; + bool hideDebugMetadata; + +public: + static char ID; + + const char *getPassName() const { return "DebugIR"; } + + // FIXME: figure out if we are compiling something that already exists on disk + // in text IR form, in which case we can omit outputting a new IR file, or if + // we're building something from memory where we actually need to emit a new + // IR file for the debugger. + + /// Output a file with the same base name as the original, but with the + /// postfix "-debug-ll" appended. + DebugIR() + : ModulePass(ID), Postfix("-debug-ll"), hideDebugIntrinsics(true), + hideDebugMetadata(true) {} + + /// Customize the postfix string used to replace the extension of the + /// original filename that appears in the !llvm.dbg.cu metadata node. + DebugIR(StringRef postfix, bool hideDebugIntrinsics, bool hideDebugMetadata) + : ModulePass(ID), Postfix(postfix), + hideDebugIntrinsics(hideDebugIntrinsics), + hideDebugMetadata(hideDebugMetadata) {} + +private: + // Modify the filename embedded in the Compilation-Unit debug information of M + bool replaceFilename(Module &M, const DebugInfoFinder &Finder) { + bool changed = false; + + // Sanity check -- if llvm.dbg.cu node exists, the DebugInfoFinder + // better have found at least one CU! + if (M.getNamedMetadata("llvm.dbg.cu")) + assert(Finder.compile_unit_count() > 0 && + "Found no compile units but llvm.dbg.cu node exists"); + + for (DebugInfoFinder::iterator i = Finder.compile_unit_begin(), + e = Finder.compile_unit_end(); + i != e; ++i) { + DICompileUnit CU(*i); + Filename = CU.getFilename(); + + // Replace extension with postfix + size_t dot = Filename.find_last_of("."); + if (dot != std::string::npos) + Filename.erase(dot); + Filename += Postfix; + + CU.setFilename(Filename, M.getContext()); + changed = true; + } + return changed; + } + + /// Replace existing line number metadata with line numbers that correspond + /// with the IR file that is seen by the debugger. 
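The replaceFilename step of the DebugIR pass above reduces to a small string edit: strip the extension if there is one, then append the postfix. A standalone sketch of that edit; rewriteFilename is an illustrative name, not the pass's API.

    #include <cassert>
    #include <string>

    // Standalone model of DebugIR's filename rewrite: drop the extension
    // (if any) and append the configured postfix, e.g. "-debug-ll".
    std::string rewriteFilename(std::string Filename,
                                const std::string &Postfix) {
      size_t Dot = Filename.find_last_of('.');
      if (Dot != std::string::npos)
        Filename.erase(Dot);
      return Filename + Postfix;
    }

    int main() {
      assert(rewriteFilename("foo.c", "-debug-ll") == "foo-debug-ll");
      assert(rewriteFilename("noext", "-debug-ll") == "noext-debug-ll");
      return 0;
    }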
+ void addLineNumberMetadata(Module *M, const ValueToLineMap &VLM, + const ValueToValueMapTy &VMap, + const DebugInfoFinder &Finder) { + LineNumberReplacer Replacer(VLM, Finder, VMap); + Replacer.visit(M); + } + + void writeDebugBitcode(Module *M) { + std::string error; + tool_output_file OutFile(Filename.c_str(), error); + OutFile.keep(); + formatted_raw_ostream OS; + OS.setStream(OutFile.os()); + M->print(OS, 0); + } + + void removeDebugIntrinsics(Module *M) { + DebugIntrinsicsRemover Remover; + Remover.visit(M); + } + + void removeDebugMetadata(Module *M) { + DebugMetadataRemover Remover; + Remover.run(M); + } + + void updateAndWriteDebugIRFile(Module *M, const DebugInfoFinder &Finder) { + // The module we output in text form for a debugger to open is stripped of + // 'extras' like debug intrinsics that end up in DWARF anyways and just + // clutter the debug experience. + + ValueToValueMapTy VMap; + Module *DebuggerM = CloneModule(M, VMap); + + if (hideDebugIntrinsics) + removeDebugIntrinsics(DebuggerM); + + if (hideDebugMetadata) + removeDebugMetadata(DebuggerM); + + // FIXME: remove all debug metadata from M once we support generating DWARF + // subprogram attributes. + + ValueToLineMap LineTable(DebuggerM); + addLineNumberMetadata(M, LineTable, VMap, Finder); + writeDebugBitcode(DebuggerM); + } + + bool runOnModule(Module &M) { + // Stores existing debug info needed when creating new line number entries. + DebugInfoFinder Finder; + Finder.processModule(M); + + bool changed = replaceFilename(M, Finder); + if (changed) + updateAndWriteDebugIRFile(&M, Finder); + return changed; + } +}; + +} // anonymous namespace + +char DebugIR::ID = 0; +INITIALIZE_PASS(DebugIR, "debug-ir", "Enable debugging IR", false, false) + +ModulePass *llvm::createDebugIRPass(StringRef FilenamePostfix, + bool hideDebugIntrinsics, + bool hideDebugMetadata) { + return new DebugIR(FilenamePostfix, hideDebugIntrinsics, hideDebugMetadata); +} diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 2edd151..3ce9cf6 100644 --- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -34,7 +34,7 @@ #include "llvm/Support/DebugLoc.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/InstIterator.h" -#include "llvm/Support/PathV2.h" +#include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include <string> diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 4e75904..a3a688d 100644 --- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -74,6 +74,7 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Triple.h" #include "llvm/ADT/ValueMap.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" @@ -228,7 +229,7 @@ class MemorySanitizer : public FunctionPass { MDNode *ColdCallWeights; /// \brief Branch weights for origin store. MDNode *OriginStoreWeights; - /// \bried Path to blacklist file. + /// \brief Path to blacklist file. SmallString<64> BlacklistFile; /// \brief The blacklist. 
OwningPtr<BlackList> BL; @@ -299,30 +300,30 @@ void MemorySanitizer::initializeCallbacks(Module &M) { RetvalTLS = new GlobalVariable( M, ArrayType::get(IRB.getInt64Ty(), 8), false, GlobalVariable::ExternalLinkage, 0, "__msan_retval_tls", 0, - GlobalVariable::GeneralDynamicTLSModel); + GlobalVariable::InitialExecTLSModel); RetvalOriginTLS = new GlobalVariable( M, OriginTy, false, GlobalVariable::ExternalLinkage, 0, - "__msan_retval_origin_tls", 0, GlobalVariable::GeneralDynamicTLSModel); + "__msan_retval_origin_tls", 0, GlobalVariable::InitialExecTLSModel); ParamTLS = new GlobalVariable( M, ArrayType::get(IRB.getInt64Ty(), 1000), false, GlobalVariable::ExternalLinkage, 0, "__msan_param_tls", 0, - GlobalVariable::GeneralDynamicTLSModel); + GlobalVariable::InitialExecTLSModel); ParamOriginTLS = new GlobalVariable( M, ArrayType::get(OriginTy, 1000), false, GlobalVariable::ExternalLinkage, - 0, "__msan_param_origin_tls", 0, GlobalVariable::GeneralDynamicTLSModel); + 0, "__msan_param_origin_tls", 0, GlobalVariable::InitialExecTLSModel); VAArgTLS = new GlobalVariable( M, ArrayType::get(IRB.getInt64Ty(), 1000), false, GlobalVariable::ExternalLinkage, 0, "__msan_va_arg_tls", 0, - GlobalVariable::GeneralDynamicTLSModel); + GlobalVariable::InitialExecTLSModel); VAArgOverflowSizeTLS = new GlobalVariable( M, IRB.getInt64Ty(), false, GlobalVariable::ExternalLinkage, 0, "__msan_va_arg_overflow_size_tls", 0, - GlobalVariable::GeneralDynamicTLSModel); + GlobalVariable::InitialExecTLSModel); OriginTLS = new GlobalVariable( M, IRB.getInt32Ty(), false, GlobalVariable::ExternalLinkage, 0, - "__msan_origin_tls", 0, GlobalVariable::GeneralDynamicTLSModel); + "__msan_origin_tls", 0, GlobalVariable::InitialExecTLSModel); // We insert an empty inline asm after __msan_report* to avoid callback merge. EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), @@ -365,11 +366,13 @@ bool MemorySanitizer::doInitialization(Module &M) { appendToGlobalCtors(M, cast<Function>(M.getOrInsertFunction( "__msan_init", IRB.getVoidTy(), NULL)), 0); - new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage, - IRB.getInt32(TrackOrigins), "__msan_track_origins"); + if (TrackOrigins) + new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage, + IRB.getInt32(TrackOrigins), "__msan_track_origins"); - new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage, - IRB.getInt32(ClKeepGoing), "__msan_keep_going"); + if (ClKeepGoing) + new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage, + IRB.getInt32(ClKeepGoing), "__msan_keep_going"); return true; } @@ -768,14 +771,21 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (AI->hasByValAttr()) { // ByVal pointer itself has clean shadow. We copy the actual // argument shadow to the underlying memory. + // Figure out maximal valid memcpy alignment. 
+ unsigned ArgAlign = AI->getParamAlignment(); + if (ArgAlign == 0) { + Type *EltType = A->getType()->getPointerElementType(); + ArgAlign = MS.TD->getABITypeAlignment(EltType); + } + unsigned CopyAlign = std::min(ArgAlign, kShadowTLSAlignment); Value *Cpy = EntryIRB.CreateMemCpy( - getShadowPtr(V, EntryIRB.getInt8Ty(), EntryIRB), - Base, Size, AI->getParamAlignment()); + getShadowPtr(V, EntryIRB.getInt8Ty(), EntryIRB), Base, Size, + CopyAlign); DEBUG(dbgs() << " ByValCpy: " << *Cpy << "\n"); (void)Cpy; *ShadowPtr = getCleanShadow(V); } else { - *ShadowPtr = EntryIRB.CreateLoad(Base); + *ShadowPtr = EntryIRB.CreateAlignedLoad(Base, kShadowTLSAlignment); } DEBUG(dbgs() << " ARG: " << *AI << " ==> " << **ShadowPtr << "\n"); @@ -784,7 +794,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setOrigin(A, EntryIRB.CreateLoad(OriginPtr)); } } - ArgOffset += DataLayout::RoundUpAlignment(Size, 8); + ArgOffset += DataLayout::RoundUpAlignment(Size, kShadowTLSAlignment); } assert(*ShadowPtr && "Could not find shadow for an argument"); return *ShadowPtr; @@ -1963,9 +1973,29 @@ struct VarArgAMD64Helper : public VarArgHelper { } }; -VarArgHelper* CreateVarArgHelper(Function &Func, MemorySanitizer &Msan, +/// \brief A no-op implementation of VarArgHelper. +struct VarArgNoOpHelper : public VarArgHelper { + VarArgNoOpHelper(Function &F, MemorySanitizer &MS, + MemorySanitizerVisitor &MSV) {} + + void visitCallSite(CallSite &CS, IRBuilder<> &IRB) {} + + void visitVAStartInst(VAStartInst &I) {} + + void visitVACopyInst(VACopyInst &I) {} + + void finalizeInstrumentation() {} +}; + +VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan, MemorySanitizerVisitor &Visitor) { - return new VarArgAMD64Helper(Func, Msan, Visitor); + // VarArg handling is only implemented on AMD64. False positives are possible + // on other platforms. + llvm::Triple TargetTriple(Func.getParent()->getTargetTriple()); + if (TargetTriple.getArch() == llvm::Triple::x86_64) + return new VarArgAMD64Helper(Func, Msan, Visitor); + else + return new VarArgNoOpHelper(Func, Msan, Visitor); } } // namespace diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 299060a..318fa3f 100644 --- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -579,7 +579,7 @@ int ThreadSanitizer::getMemoryAccessFuncIndex(Value *Addr) { // Ignore all unusual sizes. 
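The CreateVarArgHelper change in the MemorySanitizer hunk above is a small strategy-pattern dispatch: only x86_64 gets the real va_arg shadow model, and every other target gets hooks that do nothing rather than an AMD64-specific model that would compute wrong shadow. A standalone sketch of that shape; all names here are illustrative, not MSan's.

    #include <cassert>
    #include <memory>
    #include <string>

    struct VarArgModel {
      virtual ~VarArgModel() {}
      virtual bool instrumentsVarArgs() const = 0;
    };
    struct AMD64Model : VarArgModel {
      bool instrumentsVarArgs() const override { return true; }
    };
    struct NoOpModel : VarArgModel {
      bool instrumentsVarArgs() const override { return false; }
    };

    // Pick the real implementation only for the one target it understands;
    // everything else gets the safe no-op strategy.
    std::unique_ptr<VarArgModel> createModel(const std::string &Arch) {
      if (Arch == "x86_64")
        return std::make_unique<AMD64Model>();
      return std::make_unique<NoOpModel>();
    }

    int main() {
      assert(createModel("x86_64")->instrumentsVarArgs());
      assert(!createModel("mips")->instrumentsVarArgs());
      return 0;
    }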
return -1; } - size_t Idx = CountTrailingZeros_32(TypeSize / 8); + size_t Idx = countTrailingZeros(TypeSize / 8); assert(Idx < kNumberOfAccessSizes); return Idx; } diff --git a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index e3d51d5..fc5cf4e 100644 --- a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -30,6 +30,7 @@ #include "ObjCARCAliasAnalysis.h" #include "ProvenanceAnalysis.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" @@ -107,6 +108,12 @@ namespace { return std::make_pair(Vector.begin() + Pair.first->second, false); } + iterator find(const KeyT &Key) { + typename MapTy::iterator It = Map.find(Key); + if (It == Map.end()) return Vector.end(); + return Vector.begin() + It->second; + } + const_iterator find(const KeyT &Key) const { typename MapTy::const_iterator It = Map.find(Key); if (It == Map.end()) return Vector.end(); @@ -253,6 +260,40 @@ static bool DoesRetainableObjPtrEscape(const User *Ptr) { return false; } +/// This is a wrapper around getUnderlyingObjCPtr along the lines of +/// GetUnderlyingObjects except that it returns early when it sees the first +/// alloca. +static inline bool AreAnyUnderlyingObjectsAnAlloca(const Value *V) { + SmallPtrSet<const Value *, 4> Visited; + SmallVector<const Value *, 4> Worklist; + Worklist.push_back(V); + do { + const Value *P = Worklist.pop_back_val(); + P = GetUnderlyingObjCPtr(P); + + if (isa<AllocaInst>(P)) + return true; + + if (!Visited.insert(P)) + continue; + + if (const SelectInst *SI = dyn_cast<const SelectInst>(P)) { + Worklist.push_back(SI->getTrueValue()); + Worklist.push_back(SI->getFalseValue()); + continue; + } + + if (const PHINode *PN = dyn_cast<const PHINode>(P)) { + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + Worklist.push_back(PN->getIncomingValue(i)); + continue; + } + } while (!Worklist.empty()); + + return false; +} + + /// @} /// /// \defgroup ARCOpt ARC Optimization. @@ -300,18 +341,18 @@ STATISTIC(NumNoops, "Number of no-op objc calls eliminated"); STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated"); STATISTIC(NumAutoreleases,"Number of autoreleases converted to releases"); STATISTIC(NumRets, "Number of return value forwarding " - "retain+autoreleaes eliminated"); + "retain+autoreleases eliminated"); STATISTIC(NumRRs, "Number of retain+release paths eliminated"); STATISTIC(NumPeeps, "Number of calls peephole-optimized"); +#ifndef NDEBUG STATISTIC(NumRetainsBeforeOpt, - "Number of retains before optimization."); + "Number of retains before optimization"); STATISTIC(NumReleasesBeforeOpt, - "Number of releases before optimization."); -#ifndef NDEBUG + "Number of releases before optimization"); STATISTIC(NumRetainsAfterOpt, - "Number of retains after optimization."); + "Number of retains after optimization"); STATISTIC(NumReleasesAfterOpt, - "Number of releases after optimization."); + "Number of releases after optimization"); #endif namespace { @@ -414,8 +455,13 @@ namespace { /// sequence. SmallPtrSet<Instruction *, 2> ReverseInsertPts; + /// If this is true, we cannot perform code motion but can still remove + /// retain/release pairs. 
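AreAnyUnderlyingObjectsAnAlloca above is a standard worklist-plus-visited-set walk with an early exit: selects and phis fan out into their incoming values, and the visited set is what makes cyclic phi webs terminate. A standalone model under those assumptions, with a toy node type standing in for LLVM values.

    #include <cassert>
    #include <set>
    #include <vector>

    enum Kind { Alloca, Select, Phi, Other };
    struct Node {
      Kind K;
      std::vector<const Node *> Operands; // select/phi incoming values
    };

    // Worklist walk, returning early at the first alloca; the visited set
    // makes cyclic phi graphs terminate.
    bool anyUnderlyingAlloca(const Node *Root) {
      std::set<const Node *> Visited;
      std::vector<const Node *> Worklist = {Root};
      do {
        const Node *P = Worklist.back();
        Worklist.pop_back();
        if (P->K == Alloca)
          return true;
        if (!Visited.insert(P).second)
          continue;
        if (P->K == Select || P->K == Phi)
          for (const Node *Op : P->Operands)
            Worklist.push_back(Op);
      } while (!Worklist.empty());
      return false;
    }

    int main() {
      Node A{Alloca, {}}, G{Other, {}};
      Node Phi1{Phi, {}};
      Phi1.Operands = {&Phi1, &A}; // self-loop plus an alloca input
      Node S{Select, {&G, &Phi1}};
      assert(anyUnderlyingAlloca(&S));
      assert(!anyUnderlyingAlloca(&G));
      return 0;
    }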
+ bool CFGHazardAfflicted; + RRInfo() : - KnownSafe(false), IsTailCallRelease(false), ReleaseMetadata(0) {} + KnownSafe(false), IsTailCallRelease(false), ReleaseMetadata(0), + CFGHazardAfflicted(false) {} void clear(); @@ -431,6 +477,7 @@ void RRInfo::clear() { ReleaseMetadata = 0; Calls.clear(); ReverseInsertPts.clear(); + CFGHazardAfflicted = false; } namespace { @@ -457,10 +504,12 @@ namespace { Seq(S_None) {} void SetKnownPositiveRefCount() { + DEBUG(dbgs() << "Setting Known Positive.\n"); KnownPositiveRefCount = true; } void ClearKnownPositiveRefCount() { + DEBUG(dbgs() << "Clearing Known Positive.\n"); KnownPositiveRefCount = false; } @@ -516,6 +565,7 @@ PtrState::Merge(const PtrState &Other, bool TopDown) { RRI.IsTailCallRelease = RRI.IsTailCallRelease && Other.RRI.IsTailCallRelease; RRI.Calls.insert(Other.RRI.Calls.begin(), Other.RRI.Calls.end()); + RRI.CFGHazardAfflicted |= Other.RRI.CFGHazardAfflicted; // Merge the insert point sets. If there are any differences, // that makes this a partial merge. @@ -587,14 +637,26 @@ namespace { /// definition. void SetAsExit() { BottomUpPathCount = 1; } + /// Attempt to find the PtrState object describing the top down state for + /// pointer Arg. Return a new initialized PtrState describing the top down + /// state for Arg if we do not find one. PtrState &getPtrTopDownState(const Value *Arg) { return PerPtrTopDown[Arg]; } + /// Attempt to find the PtrState object describing the bottom up state for + /// pointer Arg. Return a new initialized PtrState describing the bottom up + /// state for Arg if we do not find one. PtrState &getPtrBottomUpState(const Value *Arg) { return PerPtrBottomUp[Arg]; } + /// Attempt to find the PtrState object describing the bottom up state for + /// pointer Arg. + ptr_iterator findPtrBottomUpState(const Value *Arg) { + return PerPtrBottomUp.find(Arg); + } + void clearBottomUpPointers() { PerPtrBottomUp.clear(); } @@ -608,13 +670,20 @@ namespace { void MergePred(const BBState &Other); void MergeSucc(const BBState &Other); - /// Return the number of possible unique paths from an entry to an exit + /// Compute the number of possible unique paths from an entry to an exit /// which pass through this block. This is only valid after both the /// top-down and bottom-up traversals are complete. - unsigned GetAllPathCount() const { + /// + /// Returns true if overflow occurred. Returns false if overflow did not + /// occur. + bool GetAllPathCountWithOverflow(unsigned &PathCount) const { assert(TopDownPathCount != 0); assert(BottomUpPathCount != 0); - return TopDownPathCount * BottomUpPathCount; + unsigned long long Product = + (unsigned long long)TopDownPathCount*BottomUpPathCount; + PathCount = Product; + // Overflow occurred if any of the upper bits of Product are set. + return Product >> 32; } // Specialized CFG utilities. @@ -992,6 +1061,9 @@ namespace { bool Changed; ProvenanceAnalysis PA; + // This is used to track if a pointer is stored into an alloca. + DenseSet<const Value *> MultiOwnersSet; + /// A flag indicating whether this optimization pass should run. bool Run; @@ -1440,11 +1512,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { case IC_RetainBlock: // If we strength reduce an objc_retainBlock to an objc_retain, continue // onto the objc_retain peephole optimizations. Otherwise break.
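GetAllPathCountWithOverflow in the hunk above widens the multiplication to 64 bits and treats any bit above bit 31 as overflow, so callers can bail out instead of acting on a truncated count. The same idiom in a standalone form:

    #include <cassert>
    #include <cstdint>

    // Model of GetAllPathCountWithOverflow: multiply two 32-bit path counts
    // in 64 bits and report overflow if anything spills past bit 31.
    bool pathCountWithOverflow(uint32_t TopDown, uint32_t BottomUp,
                               uint32_t &PathCount) {
      uint64_t Product = (uint64_t)TopDown * BottomUp;
      PathCount = (uint32_t)Product;   // truncated result
      return (Product >> 32) != 0;     // true on overflow
    }

    int main() {
      uint32_t PC;
      assert(!pathCountWithOverflow(1000, 1000, PC) && PC == 1000000);
      assert(pathCountWithOverflow(1u << 20, 1u << 20, PC)); // 2^40 overflows
      return 0;
    }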
- if (!OptimizeRetainBlockCall(F, Inst, Class)) - break; - // FALLTHROUGH - case IC_Retain: - ++NumRetainsBeforeOpt; + OptimizeRetainBlockCall(F, Inst, Class); break; case IC_RetainRV: if (OptimizeRetainRVCall(F, Inst)) @@ -1453,9 +1521,6 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { case IC_AutoreleaseRV: OptimizeAutoreleaseRVCall(F, Inst, Class); break; - case IC_Release: - ++NumReleasesBeforeOpt; - break; } // objc_autorelease(x) -> objc_release(x) if x is otherwise unused. @@ -1472,8 +1537,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { CallInst *NewCall = CallInst::Create(getReleaseCallee(F.getParent()), Call->getArgOperand(0), "", Call); - NewCall->setMetadata(ImpreciseReleaseMDKind, - MDNode::get(C, ArrayRef<Value *>())); + NewCall->setMetadata(ImpreciseReleaseMDKind, MDNode::get(C, None)); DEBUG(dbgs() << "Replacing autorelease{,RV}(x) with objc_release(x) " "since x is otherwise unused.\nOld: " << *Call << "\nNew: " @@ -1640,6 +1704,7 @@ static void CheckForUseCFGHazard(const Sequence SuccSSeq, PtrState &S, bool &SomeSuccHasSame, bool &AllSuccsHaveSame, + bool &NotAllSeqEqualButKnownSafe, bool &ShouldContinue) { switch (SuccSSeq) { case S_CanRelease: { @@ -1647,6 +1712,7 @@ static void CheckForUseCFGHazard(const Sequence SuccSSeq, S.ClearSequenceProgress(); break; } + S.RRI.CFGHazardAfflicted = true; ShouldContinue = true; break; } @@ -1658,6 +1724,8 @@ static void CheckForUseCFGHazard(const Sequence SuccSSeq, case S_MovableRelease: if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) AllSuccsHaveSame = false; + else + NotAllSeqEqualButKnownSafe = true; break; case S_Retain: llvm_unreachable("bottom-up pointer in retain state!"); @@ -1673,7 +1741,8 @@ static void CheckForCanReleaseCFGHazard(const Sequence SuccSSeq, const bool SuccSRRIKnownSafe, PtrState &S, bool &SomeSuccHasSame, - bool &AllSuccsHaveSame) { + bool &AllSuccsHaveSame, + bool &NotAllSeqEqualButKnownSafe) { switch (SuccSSeq) { case S_CanRelease: SomeSuccHasSame = true; @@ -1684,6 +1753,8 @@ static void CheckForCanReleaseCFGHazard(const Sequence SuccSSeq, case S_Use: if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) AllSuccsHaveSame = false; + else + NotAllSeqEqualButKnownSafe = true; break; case S_Retain: llvm_unreachable("bottom-up pointer in retain state!"); @@ -1719,6 +1790,7 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); bool SomeSuccHasSame = false; bool AllSuccsHaveSame = true; + bool NotAllSeqEqualButKnownSafe = false; succ_const_iterator SI(TI), SE(TI, false); @@ -1750,17 +1822,17 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, switch(S.GetSeq()) { case S_Use: { bool ShouldContinue = false; - CheckForUseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, S, - SomeSuccHasSame, AllSuccsHaveSame, + CheckForUseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, S, SomeSuccHasSame, + AllSuccsHaveSame, NotAllSeqEqualButKnownSafe, ShouldContinue); if (ShouldContinue) continue; break; } case S_CanRelease: { - CheckForCanReleaseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, - S, SomeSuccHasSame, - AllSuccsHaveSame); + CheckForCanReleaseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, S, + SomeSuccHasSame, AllSuccsHaveSame, + NotAllSeqEqualButKnownSafe); break; } case S_Retain: @@ -1775,8 +1847,15 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, // If the state at the other end of any of the successor edges // matches the current state, require all edges to match. This // guards against loops in the middle of a sequence. 
- if (SomeSuccHasSame && !AllSuccsHaveSame) + if (SomeSuccHasSame && !AllSuccsHaveSame) { S.ClearSequenceProgress(); + } else if (NotAllSeqEqualButKnownSafe) { + // If we would have cleared the state were it not for the fact that we are + // known safe, stop code motion. This is because whether or not it is safe + // to remove RR pairs via KnownSafe is an orthogonal concept to whether we + // are allowed to perform code motion. + S.RRI.CFGHazardAfflicted = true; + } } } @@ -1867,6 +1946,28 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, case IC_None: // These are irrelevant. return NestingDetected; + case IC_User: + // If we have a store into an alloca of a pointer we are tracking, the + // pointer has multiple owners, implying that we must be more conservative. + // + // This comes up in the context of a pointer being ``KnownSafe''. In the + // presence of a block being initialized, the frontend will emit the + // objc_retain on the original pointer and the release on the pointer loaded + // from the alloca. The optimizer will, through the provenance analysis, + // realize that the two are related, but since we only require KnownSafe in + // one direction, it will match the inner retain on the original pointer with + // the guarding release on the original pointer. This is fixed by ensuring that + // in the presence of allocas we only unconditionally remove pointers if + // both our retain and our release are KnownSafe. + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + if (AreAnyUnderlyingObjectsAnAlloca(SI->getPointerOperand())) { + BBState::ptr_iterator I = MyStates.findPtrBottomUpState( + StripPointerCastsAndObjCCalls(SI->getValueOperand())); + if (I != MyStates.bottom_up_ptr_end()) + MultiOwnersSet.insert(I->first); + } + } + break; default: break; } @@ -2413,8 +2514,11 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> bool KnownSafe, bool &AnyPairsCompletelyEliminated) { // If a pair happens in a region where it is known that the reference count - // is already incremented, we can similarly ignore possible decrements. + // is already incremented, we can similarly ignore possible decrements unless + // we are dealing with a retainable object with multiple provenance sources. bool KnownSafeTD = true, KnownSafeBU = true; + bool MultipleOwners = false; + bool CFGHazardAfflicted = false; // Connect the dots between the top-down-collected RetainsToMove and // bottom-up-collected ReleasesToMove to form sets of related calls. @@ -2433,6 +2537,8 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> assert(It != Retains.end()); const RRInfo &NewRetainRRI = It->second; KnownSafeTD &= NewRetainRRI.KnownSafe; + MultipleOwners = + MultipleOwners || MultiOwnersSet.count(GetObjCArg(NewRetain)); for (SmallPtrSet<Instruction *, 2>::const_iterator LI = NewRetainRRI.Calls.begin(), LE = NewRetainRRI.Calls.end(); LI != LE; ++LI) { @@ -2444,8 +2550,14 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> const RRInfo &NewRetainReleaseRRI = Jt->second; assert(NewRetainReleaseRRI.Calls.count(NewRetain)); if (ReleasesToMove.Calls.insert(NewRetainRelease)) { - OldDelta -= - BBStates[NewRetainRelease->getParent()].GetAllPathCount(); + + // If we overflow when we compute the path count, don't remove/move + // anything.
+ const BBState &NRRBBState = BBStates[NewRetainRelease->getParent()]; + unsigned PathCount; + if (NRRBBState.GetAllPathCountWithOverflow(PathCount)) + return false; + OldDelta -= PathCount; // Merge the ReleaseMetadata and IsTailCallRelease values. if (FirstRelease) { @@ -2470,8 +2582,14 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> RE = NewRetainReleaseRRI.ReverseInsertPts.end(); RI != RE; ++RI) { Instruction *RIP = *RI; - if (ReleasesToMove.ReverseInsertPts.insert(RIP)) - NewDelta -= BBStates[RIP->getParent()].GetAllPathCount(); + if (ReleasesToMove.ReverseInsertPts.insert(RIP)) { + // If we overflow when we compute the path count, don't + // remove/move anything. + const BBState &RIPBBState = BBStates[RIP->getParent()]; + if (RIPBBState.GetAllPathCountWithOverflow(PathCount)) + return false; + NewDelta -= PathCount; + } } NewReleases.push_back(NewRetainRelease); } @@ -2489,6 +2607,7 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> assert(It != Releases.end()); const RRInfo &NewReleaseRRI = It->second; KnownSafeBU &= NewReleaseRRI.KnownSafe; + CFGHazardAfflicted |= NewReleaseRRI.CFGHazardAfflicted; for (SmallPtrSet<Instruction *, 2>::const_iterator LI = NewReleaseRRI.Calls.begin(), LE = NewReleaseRRI.Calls.end(); LI != LE; ++LI) { @@ -2500,8 +2619,13 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> const RRInfo &NewReleaseRetainRRI = Jt->second; assert(NewReleaseRetainRRI.Calls.count(NewRelease)); if (RetainsToMove.Calls.insert(NewReleaseRetain)) { - unsigned PathCount = - BBStates[NewReleaseRetain->getParent()].GetAllPathCount(); + + // If we overflow when we compute the path count, don't remove/move + // anything. + const BBState &NRRBBState = BBStates[NewReleaseRetain->getParent()]; + unsigned PathCount; + if (NRRBBState.GetAllPathCountWithOverflow(PathCount)) + return false; OldDelta += PathCount; OldCount += PathCount; @@ -2513,7 +2637,11 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> RI != RE; ++RI) { Instruction *RIP = *RI; if (RetainsToMove.ReverseInsertPts.insert(RIP)) { - PathCount = BBStates[RIP->getParent()].GetAllPathCount(); + // If we overflow when we compute the path count, don't + // remove/move anything. + const BBState &RIPBBState = BBStates[RIP->getParent()]; + if (RIPBBState.GetAllPathCountWithOverflow(PathCount)) + return false; NewDelta += PathCount; NewCount += PathCount; } @@ -2526,9 +2654,12 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> if (NewRetains.empty()) break; } - // If the pointer is known incremented or nested, we can safely delete the - // pair regardless of what's between them. - if (KnownSafeTD || KnownSafeBU) { + // If the pointer is known incremented in 1 direction and we do not have + // MultipleOwners, we can safely remove the retain/releases. Otherwise we need + // to be known safe in both directions. + bool UnconditionallySafe = (KnownSafeTD && KnownSafeBU) || + ((KnownSafeTD || KnownSafeBU) && !MultipleOwners); + if (UnconditionallySafe) { RetainsToMove.ReverseInsertPts.clear(); ReleasesToMove.ReverseInsertPts.clear(); NewCount = 0; @@ -2539,6 +2670,14 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> // less aggressive solution which is. if (NewDelta != 0) return false; + + // At this point, we are not going to remove any RR pairs, but we still are + // able to move RR pairs. If one of our pointers is afflicted with + // CFGHazards, we cannot perform such code motion so exit early. 
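The UnconditionallySafe condition in the hunk above tightens the old rule: one-directional KnownSafe suffices only for a single-owner pointer, while a pointer with multiple provenance sources requires safety in both directions. A standalone restatement with the three interesting cases checked:

    #include <cassert>

    // Model of the relaxed safety rule: one-directional KnownSafe is
    // enough only when the object has a single provenance source; with
    // multiple owners, both directions must be known safe.
    bool unconditionallySafe(bool KnownSafeTD, bool KnownSafeBU,
                             bool MultipleOwners) {
      return (KnownSafeTD && KnownSafeBU) ||
             ((KnownSafeTD || KnownSafeBU) && !MultipleOwners);
    }

    int main() {
      assert(unconditionallySafe(true, false, false)); // one dir, sole owner
      assert(!unconditionallySafe(true, false, true)); // one dir, multi-owner
      assert(unconditionallySafe(true, true, true));   // both dirs always win
      return 0;
    }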
+ const bool WillPerformCodeMotion = RetainsToMove.ReverseInsertPts.size() || + ReleasesToMove.ReverseInsertPts.size(); + if (CFGHazardAfflicted && WillPerformCodeMotion) + return false; } // Determine whether the original call points are balanced in the retain and @@ -2802,23 +2941,29 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { /// Identify program paths which execute sequences of retains and releases which /// can be eliminated. bool ObjCARCOpt::OptimizeSequences(Function &F) { - /// Releases, Retains - These are used to store the results of the main flow - /// analysis. These use Value* as the key instead of Instruction* so that the - /// map stays valid when we get around to rewriting code and calls get - /// replaced by arguments. + // Releases, Retains - These are used to store the results of the main flow + // analysis. These use Value* as the key instead of Instruction* so that the + // map stays valid when we get around to rewriting code and calls get + // replaced by arguments. DenseMap<Value *, RRInfo> Releases; MapVector<Value *, RRInfo> Retains; - /// This is used during the traversal of the function to track the - /// states for each identified object at each block. + // This is used during the traversal of the function to track the + // states for each identified object at each block. DenseMap<const BasicBlock *, BBState> BBStates; // Analyze the CFG of the function, and all instructions. bool NestingDetected = Visit(F, BBStates, Retains, Releases); // Transform. - return PerformCodePlacement(BBStates, Retains, Releases, F.getParent()) && - NestingDetected; + bool AnyPairsCompletelyEliminated = PerformCodePlacement(BBStates, Retains, + Releases, + F.getParent()); + + // Cleanup. + MultiOwnersSet.clear(); + + return AnyPairsCompletelyEliminated && NestingDetected; } /// Check if there is a dependent call earlier that does not have anything in @@ -3051,6 +3196,12 @@ bool ObjCARCOpt::runOnFunction(Function &F) { PA.setAA(&getAnalysis<AliasAnalysis>()); +#ifndef NDEBUG + if (AreStatisticsEnabled()) { + GatherStatistics(F, false); + } +#endif + // This pass performs several distinct transformations. As a compile-time aid // when compiling code that isn't ObjC, skip these if the relevant ObjC // library functions aren't declared. diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index 615c517..f0d29c8 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/ValueMap.h" #include "llvm/Analysis/DominatorInternals.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -88,7 +89,7 @@ namespace { /// Keeps track of non-local addresses that have been sunk into a block. /// This allows us to avoid inserting duplicate code for blocks with /// multiple load/stores of the same address. - DenseMap<Value*, Value*> SunkAddrs; + ValueMap<Value*, Value*> SunkAddrs; /// ModifiedDT - If CFG is modified in anyway, dominator tree may need to /// be updated. @@ -1653,10 +1654,6 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // start of the block. CurInstIterator = BB->begin(); SunkAddrs.clear(); - } else { - // This address is now available for reassignment, so erase the table - // entry; we don't want to match some completely different instruction. 
- SunkAddrs[Addr] = 0; } } ++NumMemoryInsts; diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 129af8d..996996d 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -45,6 +45,7 @@ #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include <vector> using namespace llvm; using namespace PatternMatch; @@ -498,6 +499,75 @@ void ValueTable::verifyRemoved(const Value *V) const { //===----------------------------------------------------------------------===// namespace { + class GVN; + struct AvailableValueInBlock { + /// BB - The basic block in question. + BasicBlock *BB; + enum ValType { + SimpleVal, // A simple offsetted value that is accessed. + LoadVal, // A value produced by a load. + MemIntrin // A memory intrinsic which is loaded from. + }; + + /// V - The value that is live out of the block. + PointerIntPair<Value *, 2, ValType> Val; + + /// Offset - The byte offset in Val that is interesting for the load query. + unsigned Offset; + + static AvailableValueInBlock get(BasicBlock *BB, Value *V, + unsigned Offset = 0) { + AvailableValueInBlock Res; + Res.BB = BB; + Res.Val.setPointer(V); + Res.Val.setInt(SimpleVal); + Res.Offset = Offset; + return Res; + } + + static AvailableValueInBlock getMI(BasicBlock *BB, MemIntrinsic *MI, + unsigned Offset = 0) { + AvailableValueInBlock Res; + Res.BB = BB; + Res.Val.setPointer(MI); + Res.Val.setInt(MemIntrin); + Res.Offset = Offset; + return Res; + } + + static AvailableValueInBlock getLoad(BasicBlock *BB, LoadInst *LI, + unsigned Offset = 0) { + AvailableValueInBlock Res; + Res.BB = BB; + Res.Val.setPointer(LI); + Res.Val.setInt(LoadVal); + Res.Offset = Offset; + return Res; + } + + bool isSimpleValue() const { return Val.getInt() == SimpleVal; } + bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; } + bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; } + + Value *getSimpleValue() const { + assert(isSimpleValue() && "Wrong accessor"); + return Val.getPointer(); + } + + LoadInst *getCoercedLoadValue() const { + assert(isCoercedLoadValue() && "Wrong accessor"); + return cast<LoadInst>(Val.getPointer()); + } + + MemIntrinsic *getMemIntrinValue() const { + assert(isMemIntrinValue() && "Wrong accessor"); + return cast<MemIntrinsic>(Val.getPointer()); + } + + /// MaterializeAdjustedValue - Emit code into this block to adjust the value + /// defined here to the specified type. This handles various coercion cases. 
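AvailableValueInBlock, shown above as it moves into the GVN class scope, packs its SimpleVal/LoadVal/MemIntrin discriminant into the spare low bits of the pointer via PointerIntPair and guards each accessor with an assert. A standalone analogue of the tagging idea using std::variant instead of bit-packing; this models the design, not the LLVM type.

    #include <cassert>
    #include <variant>

    struct SimpleVal { int V; };
    struct LoadVal   { int V; };
    struct MemIntrin { int V; };

    struct AvailableValue {
      std::variant<SimpleVal, LoadVal, MemIntrin> Val;
      unsigned Offset = 0;

      bool isSimpleValue() const {
        return std::holds_alternative<SimpleVal>(Val);
      }
      // Checked accessor: using the wrong one is a programming error,
      // mirroring the assert("Wrong accessor") in the LLVM struct.
      int getSimpleValue() const {
        assert(isSimpleValue() && "Wrong accessor");
        return std::get<SimpleVal>(Val).V;
      }
    };

    int main() {
      AvailableValue AV;
      AV.Val = SimpleVal{42};
      AV.Offset = 4;
      assert(AV.isSimpleValue() && AV.getSimpleValue() == 42);
      return 0;
    }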
+ Value *MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const; + }; class GVN : public FunctionPass { bool NoLoads; @@ -519,6 +589,11 @@ namespace { BumpPtrAllocator TableAllocator; SmallVector<Instruction*, 8> InstrsToErase; + + typedef SmallVector<NonLocalDepResult, 64> LoadDepVect; + typedef SmallVector<AvailableValueInBlock, 64> AvailValInBlkVect; + typedef SmallVector<BasicBlock*, 64> UnavailBlkVect; + public: static char ID; // Pass identification, replacement for typeid explicit GVN(bool noloads = false) @@ -599,11 +674,17 @@ namespace { } - // Helper fuctions - // FIXME: eliminate or document these better + // Helper functions of redundant load elimination bool processLoad(LoadInst *L); - bool processInstruction(Instruction *I); bool processNonLocalLoad(LoadInst *L); + void AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, + AvailValInBlkVect &ValuesPerBlock, + UnavailBlkVect &UnavailableBlocks); + bool PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, + UnavailBlkVect &UnavailableBlocks); + + // Other helper routines + bool processInstruction(Instruction *I); bool processBlock(BasicBlock *BB); void dump(DenseMap<uint32_t, Value*> &d); bool iterateOnFunction(Function &F); @@ -612,6 +693,7 @@ namespace { void cleanupGlobalSets(); void verifyRemoved(const Instruction *I) const; bool splitCriticalEdges(); + BasicBlock *splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ); unsigned replaceAllDominatedUsesWith(Value *From, Value *To, const BasicBlockEdge &Root); bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root); @@ -1159,114 +1241,6 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, return ConstantFoldLoadFromConstPtr(Src, &TD); } -namespace { - -struct AvailableValueInBlock { - /// BB - The basic block in question. - BasicBlock *BB; - enum ValType { - SimpleVal, // A simple offsetted value that is accessed. - LoadVal, // A value produced by a load. - MemIntrin // A memory intrinsic which is loaded from. - }; - - /// V - The value that is live out of the block. - PointerIntPair<Value *, 2, ValType> Val; - - /// Offset - The byte offset in Val that is interesting for the load query.
- unsigned Offset; - - static AvailableValueInBlock get(BasicBlock *BB, Value *V, - unsigned Offset = 0) { - AvailableValueInBlock Res; - Res.BB = BB; - Res.Val.setPointer(V); - Res.Val.setInt(SimpleVal); - Res.Offset = Offset; - return Res; - } - - static AvailableValueInBlock getMI(BasicBlock *BB, MemIntrinsic *MI, - unsigned Offset = 0) { - AvailableValueInBlock Res; - Res.BB = BB; - Res.Val.setPointer(MI); - Res.Val.setInt(MemIntrin); - Res.Offset = Offset; - return Res; - } - - static AvailableValueInBlock getLoad(BasicBlock *BB, LoadInst *LI, - unsigned Offset = 0) { - AvailableValueInBlock Res; - Res.BB = BB; - Res.Val.setPointer(LI); - Res.Val.setInt(LoadVal); - Res.Offset = Offset; - return Res; - } - - bool isSimpleValue() const { return Val.getInt() == SimpleVal; } - bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; } - bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; } - - Value *getSimpleValue() const { - assert(isSimpleValue() && "Wrong accessor"); - return Val.getPointer(); - } - - LoadInst *getCoercedLoadValue() const { - assert(isCoercedLoadValue() && "Wrong accessor"); - return cast<LoadInst>(Val.getPointer()); - } - - MemIntrinsic *getMemIntrinValue() const { - assert(isMemIntrinValue() && "Wrong accessor"); - return cast<MemIntrinsic>(Val.getPointer()); - } - - /// MaterializeAdjustedValue - Emit code into this block to adjust the value - /// defined here to the specified type. This handles various coercion cases. - Value *MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const { - Value *Res; - if (isSimpleValue()) { - Res = getSimpleValue(); - if (Res->getType() != LoadTy) { - const DataLayout *TD = gvn.getDataLayout(); - assert(TD && "Need target data to handle type mismatch case"); - Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), - *TD); - - DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " " - << *getSimpleValue() << '\n' - << *Res << '\n' << "\n\n\n"); - } - } else if (isCoercedLoadValue()) { - LoadInst *Load = getCoercedLoadValue(); - if (Load->getType() == LoadTy && Offset == 0) { - Res = Load; - } else { - Res = GetLoadValueForLoad(Load, Offset, LoadTy, BB->getTerminator(), - gvn); - - DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " " - << *getCoercedLoadValue() << '\n' - << *Res << '\n' << "\n\n\n"); - } - } else { - const DataLayout *TD = gvn.getDataLayout(); - assert(TD && "Need target data to handle type mismatch case"); - Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, - LoadTy, BB->getTerminator(), *TD); - DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset - << " " << *getMemIntrinValue() << '\n' - << *Res << '\n' << "\n\n\n"); - } - return Res; - } -}; - -} // end anonymous namespace /// ConstructSSAForLoadSet - Given a set of loads specified by ValuesPerBlock, /// construct SSA form, allowing us to eliminate LI. 
This returns the value @@ -1323,48 +1297,59 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, return V; } +Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const { + Value *Res; + if (isSimpleValue()) { + Res = getSimpleValue(); + if (Res->getType() != LoadTy) { + const DataLayout *TD = gvn.getDataLayout(); + assert(TD && "Need target data to handle type mismatch case"); + Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), + *TD); + + DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " " + << *getSimpleValue() << '\n' + << *Res << '\n' << "\n\n\n"); + } + } else if (isCoercedLoadValue()) { + LoadInst *Load = getCoercedLoadValue(); + if (Load->getType() == LoadTy && Offset == 0) { + Res = Load; + } else { + Res = GetLoadValueForLoad(Load, Offset, LoadTy, BB->getTerminator(), + gvn); + + DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " " + << *getCoercedLoadValue() << '\n' + << *Res << '\n' << "\n\n\n"); + } + } else { + const DataLayout *TD = gvn.getDataLayout(); + assert(TD && "Need target data to handle type mismatch case"); + Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, + LoadTy, BB->getTerminator(), *TD); + DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset + << " " << *getMemIntrinValue() << '\n' + << *Res << '\n' << "\n\n\n"); + } + return Res; +} + static bool isLifetimeStart(const Instruction *Inst) { if (const IntrinsicInst* II = dyn_cast<IntrinsicInst>(Inst)) return II->getIntrinsicID() == Intrinsic::lifetime_start; return false; } -/// processNonLocalLoad - Attempt to eliminate a load whose dependencies are -/// non-local by performing PHI construction. -bool GVN::processNonLocalLoad(LoadInst *LI) { - // Find the non-local dependencies of the load. - SmallVector<NonLocalDepResult, 64> Deps; - AliasAnalysis::Location Loc = VN.getAliasAnalysis()->getLocation(LI); - MD->getNonLocalPointerDependency(Loc, true, LI->getParent(), Deps); - //DEBUG(dbgs() << "INVESTIGATING NONLOCAL LOAD: " - // << Deps.size() << *LI << '\n'); - - // If we had to process more than one hundred blocks to find the - // dependencies, this load isn't worth worrying about. Optimizing - // it will be too expensive. - unsigned NumDeps = Deps.size(); - if (NumDeps > 100) - return false; - - // If we had a phi translation failure, we'll have a single entry which is a - // clobber in the current block. Reject this early. - if (NumDeps == 1 && - !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) { - DEBUG( - dbgs() << "GVN: non-local load "; - WriteAsOperand(dbgs(), LI); - dbgs() << " has unknown dependencies\n"; - ); - return false; - } +void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, + AvailValInBlkVect &ValuesPerBlock, + UnavailBlkVect &UnavailableBlocks) { // Filter out useless results (non-locals, etc). Keep track of the blocks // where we have a value available in repl, also keep track of whether we see // dependencies that produce an unknown value for the load (such as a call // that could potentially clobber the load). 
- SmallVector<AvailableValueInBlock, 64> ValuesPerBlock; - SmallVector<BasicBlock*, 64> UnavailableBlocks; - + unsigned NumDeps = Deps.size(); for (unsigned i = 0, e = NumDeps; i != e; ++i) { BasicBlock *DepBB = Deps[i].getBB(); MemDepResult DepInfo = Deps[i].getResult(); @@ -1480,35 +1465,11 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { } UnavailableBlocks.push_back(DepBB); - continue; } +} - // If we have no predecessors that produce a known value for this load, exit - // early. - if (ValuesPerBlock.empty()) return false; - - // If all of the instructions we depend on produce a known value for this - // load, then it is fully redundant and we can use PHI insertion to compute - // its value. Insert PHIs and remove the fully redundant value now. - if (UnavailableBlocks.empty()) { - DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n'); - - // Perform PHI construction. - Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this); - LI->replaceAllUsesWith(V); - - if (isa<PHINode>(V)) - V->takeName(LI); - if (V->getType()->getScalarType()->isPointerTy()) - MD->invalidateCachedPointerInfo(V); - markInstructionForDeletion(LI); - ++NumGVNLoad; - return true; - } - - if (!EnablePRE || !EnableLoadPRE) - return false; - +bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, + UnavailBlkVect &UnavailableBlocks) { // Okay, we have *some* definitions of the value. This means that the value // is available in some of our (transitive) predecessors. Lets think about // doing PRE of this load. This will involve inserting a new load into the @@ -1526,7 +1487,6 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { BasicBlock *LoadBB = LI->getParent(); BasicBlock *TmpBB = LoadBB; - bool allSingleSucc = true; while (TmpBB->getSinglePredecessor()) { TmpBB = TmpBB->getSinglePredecessor(); if (TmpBB == LoadBB) // Infinite (unreachable) loop. @@ -1555,7 +1515,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i) FullyAvailableBlocks[UnavailableBlocks[i]] = false; - SmallVector<std::pair<TerminatorInst*, unsigned>, 4> NeedToSplit; + SmallVector<BasicBlock *, 4> CriticalEdgePred; for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB); PI != E; ++PI) { BasicBlock *Pred = *PI; @@ -1578,20 +1538,14 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return false; } - unsigned SuccNum = GetSuccessorNumber(Pred, LoadBB); - NeedToSplit.push_back(std::make_pair(Pred->getTerminator(), SuccNum)); + CriticalEdgePred.push_back(Pred); } } - if (!NeedToSplit.empty()) { - toSplit.append(NeedToSplit.begin(), NeedToSplit.end()); - return false; - } - // Decide whether PRE is profitable for this load. unsigned NumUnavailablePreds = PredLoads.size(); assert(NumUnavailablePreds != 0 && - "Fully available value should be eliminated above!"); + "Fully available value should already be eliminated!"); // If this load is unavailable in multiple predecessors, reject it. // FIXME: If we could restructure the CFG, we could make a common pred with @@ -1600,6 +1554,17 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { if (NumUnavailablePreds != 1) return false; + // Split critical edges, and update the unavailable predecessors accordingly. 
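A critical edge is one that leaves a block with several successors and enters a block with several predecessors; PRE cannot place the compensating load on such an edge until a new block is interposed. A standalone sketch of the predicate and the split, using hypothetical Block stand-ins rather than the LLVM API; the loop below does the real work through GVN::splitCriticalEdges and then remaps PredLoads to the inserted block:

#include <vector>

struct Block {
  std::vector<Block *> Succs, Preds;
};

// An edge Pred -> Succ is critical when code inserted "on the edge" would
// otherwise run on Pred's other paths or reach Succ along other paths.
static bool isCriticalEdge(const Block &Pred, const Block &Succ) {
  return Pred.Succs.size() > 1 && Succ.Preds.size() > 1;
}

// Splitting interposes a fresh block that carries only this one edge,
// giving PRE a safe home for the compensating load.
static Block *splitEdge(Block *Pred, Block *Succ) {
  Block *Mid = new Block;
  for (std::vector<Block *>::iterator I = Pred->Succs.begin(),
       E = Pred->Succs.end(); I != E; ++I)
    if (*I == Succ) *I = Mid;
  for (std::vector<Block *>::iterator I = Succ->Preds.begin(),
       E = Succ->Preds.end(); I != E; ++I)
    if (*I == Pred) *I = Mid;
  Mid->Preds.push_back(Pred);
  Mid->Succs.push_back(Succ);
  return Mid;
}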
+ for (SmallVector<BasicBlock *, 4>::iterator I = CriticalEdgePred.begin(), + E = CriticalEdgePred.end(); I != E; I++) { + BasicBlock *OrigPred = *I; + BasicBlock *NewPred = splitCriticalEdges(OrigPred, LoadBB); + PredLoads.erase(OrigPred); + PredLoads[NewPred] = 0; + DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->" + << LoadBB->getName() << '\n'); + } + // Check if the load can safely be moved to all the unavailable predecessors. bool CanDoPRE = true; SmallVector<Instruction*, 8> NewInsts; @@ -1615,13 +1580,8 @@ // pointer if it is not available. PHITransAddr Address(LI->getPointerOperand(), TD); Value *LoadPtr = 0; - if (allSingleSucc) { - LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, - *DT, NewInsts); - } else { - Address.PHITranslateValue(LoadBB, UnavailablePred, DT); - LoadPtr = Address.getAddr(); - } + LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, + *DT, NewInsts); // If we couldn't find or insert a computation of this phi translated value, // we fail PRE. @@ -1632,24 +1592,6 @@ break; } - // Make sure it is valid to move this load here. We have to watch out for: - // @1 = getelementptr (i8* p, ... - // test p and branch if == 0 - // load @1 - // It is valid to have the getelementptr before the test, even if p can - // be 0, as getelementptr only does address arithmetic. - // If we are not pushing the value through any multiple-successor blocks - // we do not have this case. Otherwise, check that the load is safe to - // put anywhere; this can be improved, but should be conservatively safe. - if (!allSingleSucc && - // FIXME: REEVALUTE THIS. - !isSafeToLoadUnconditionally(LoadPtr, - UnavailablePred->getTerminator(), - LI->getAlignment(), TD)) { - CanDoPRE = false; - break; - } - I->second = LoadPtr; } @@ -1659,7 +1601,9 @@ if (MD) MD->removeInstruction(I); I->eraseFromParent(); } - return false; + // Note: don't revert the edge-splitting, as the following transformations + // may also need to split these critical edges. + return !CriticalEdgePred.empty(); } // Okay, we can eliminate this load by inserting a reload in the predecessor @@ -1714,6 +1658,72 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return true; } +/// processNonLocalLoad - Attempt to eliminate a load whose dependencies are +/// non-local by performing PHI construction. +bool GVN::processNonLocalLoad(LoadInst *LI) { + // Step 1: Find the non-local dependencies of the load. + LoadDepVect Deps; + AliasAnalysis::Location Loc = VN.getAliasAnalysis()->getLocation(LI); + MD->getNonLocalPointerDependency(Loc, true, LI->getParent(), Deps); + + // If we had to process more than one hundred blocks to find the + // dependencies, this load isn't worth worrying about. Optimizing + // it will be too expensive. + unsigned NumDeps = Deps.size(); + if (NumDeps > 100) + return false; + + // If we had a phi translation failure, we'll have a single entry which is a + // clobber in the current block. Reject this early.
+ if (NumDeps == 1 && + !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) { + DEBUG( + dbgs() << "GVN: non-local load "; + WriteAsOperand(dbgs(), LI); + dbgs() << " has unknown dependencies\n"; + ); + return false; + } + + // Step 2: Analyze the availability of the load. + AvailValInBlkVect ValuesPerBlock; + UnavailBlkVect UnavailableBlocks; + AnalyzeLoadAvailability(LI, Deps, ValuesPerBlock, UnavailableBlocks); + + // If we have no predecessors that produce a known value for this load, exit + // early. + if (ValuesPerBlock.empty()) + return false; + + // Step 3: Eliminate full redundancy. + // + // If all of the instructions we depend on produce a known value for this + // load, then it is fully redundant and we can use PHI insertion to compute + // its value. Insert PHIs and remove the fully redundant value now. + if (UnavailableBlocks.empty()) { + DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n'); + + // Perform PHI construction. + Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this); + LI->replaceAllUsesWith(V); + + if (isa<PHINode>(V)) + V->takeName(LI); + if (V->getType()->getScalarType()->isPointerTy()) + MD->invalidateCachedPointerInfo(V); + markInstructionForDeletion(LI); + ++NumGVNLoad; + return true; + } + + // Step 4: Eliminate partial redundancy. + if (!EnablePRE || !EnableLoadPRE) + return false; + + return PerformLoadPRE(LI, ValuesPerBlock, UnavailableBlocks); +} + + static void patchReplacementInstruction(Instruction *I, Value *Repl) { // Patch the replacement so that it is not more restrictive than the value // being replaced. @@ -2296,8 +2306,6 @@ bool GVN::runOnFunction(Function& F) { while (ShouldContinue) { DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n"); ShouldContinue = iterateOnFunction(F); - if (splitCriticalEdges()) - ShouldContinue = true; Changed |= ShouldContinue; ++Iteration; } @@ -2309,6 +2317,7 @@ Changed |= PREChanged; } } + // FIXME: Should perform GVN again after PRE does something. PRE can move // computations into blocks where they become fully redundant. Note that // we can't do this until PRE's critical edge splitting updates memdep. @@ -2542,6 +2551,15 @@ bool GVN::performPRE(Function &F) { return Changed; } +/// Split the critical edge connecting the given two blocks, and return +/// the block inserted on that edge. +BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) { + BasicBlock *BB = SplitCriticalEdge(Pred, Succ, this); + if (MD) + MD->invalidateCachedPredecessors(); + return BB; +} + /// splitCriticalEdges - Split critical edges found during the previous /// iteration that may enable further optimization. bool GVN::splitCriticalEdges() { @@ -2568,9 +2586,18 @@ bool GVN::iterateOnFunction(Function &F) { RE = RPOT.end(); RI != RE; ++RI) Changed |= processBlock(*RI); #else + // Save the blocks this function has before the transformation begins. GVN + // may split critical edges, and hence may invalidate the RPO/DT iterators.
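The comment above points at a general idiom: snapshot the traversal while the graph is stable, then mutate while walking the copy. A standalone sketch with a hypothetical Node type; the hunk below applies the same idea by copying the dominator-tree walk into BBVect before calling processBlock:

#include <vector>

struct Node {
  std::vector<Node *> Kids;
};

template <typename Visitor>
void visitAllThenMutate(Node *Root, Visitor Visit) {
  // Phase 1: snapshot a preorder walk while no edges are changing.
  std::vector<Node *> Order;
  std::vector<Node *> Work(1, Root);
  while (!Work.empty()) {
    Node *N = Work.back();
    Work.pop_back();
    Order.push_back(N);
    for (unsigned I = 0, E = N->Kids.size(); I != E; ++I)
      Work.push_back(N->Kids[I]);
  }
  // Phase 2: Visit may now split edges or add nodes; Order is unaffected.
  for (unsigned I = 0, E = Order.size(); I != E; ++I)
    Visit(Order[I]);
}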
+ // + std::vector<BasicBlock *> BBVect; + BBVect.reserve(256); for (df_iterator<DomTreeNode*> DI = df_begin(DT->getRootNode()), DE = df_end(DT->getRootNode()); DI != DE; ++DI) - Changed |= processBlock(DI->getBlock()); + BBVect.push_back(DI->getBlock()); + + for (std::vector<BasicBlock *>::iterator I = BBVect.begin(), E = BBVect.end(); + I != E; I++) + Changed |= processBlock(*I); #endif return Changed; diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 8e76c78..df11e92 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -532,7 +532,7 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { // and varies predictably *inside* the loop. Evaluate the value it // contains when the loop exits, if possible. const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop()); - if (!SE->isLoopInvariant(ExitValue, L)) + if (!SE->isLoopInvariant(ExitValue, L) || !isSafeToExpand(ExitValue)) continue; // Computing the value outside of the loop brings no benefit if : diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index e98ae95..14c5655 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -56,8 +56,8 @@ namespace { } bool runOnLoop(Loop *L, LPPassManager &LPM); - void simplifyLoopLatch(Loop *L); - bool rotateLoop(Loop *L); + bool simplifyLoopLatch(Loop *L); + bool rotateLoop(Loop *L, bool SimplifiedLatch); private: LoopInfo *LI; @@ -84,13 +84,14 @@ bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) { // Simplify the loop latch before attempting to rotate the header // upward. Rotation may not be needed if the loop tail can be folded into the // loop exit. - simplifyLoopLatch(L); + bool SimplifiedLatch = simplifyLoopLatch(L); // One loop can be rotated multiple times. bool MadeChange = false; - while (rotateLoop(L)) + while (rotateLoop(L, SimplifiedLatch)) { MadeChange = true; - + SimplifiedLatch = false; + } return MadeChange; } @@ -212,25 +213,25 @@ static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, /// canonical form so downstream passes can handle it. /// /// I don't believe this invalidates SCEV. -void LoopRotate::simplifyLoopLatch(Loop *L) { +bool LoopRotate::simplifyLoopLatch(Loop *L) { BasicBlock *Latch = L->getLoopLatch(); if (!Latch || Latch->hasAddressTaken()) - return; + return false; BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator()); if (!Jmp || !Jmp->isUnconditional()) - return; + return false; BasicBlock *LastExit = Latch->getSinglePredecessor(); if (!LastExit || !L->isLoopExiting(LastExit)) - return; + return false; BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator()); if (!BI) - return; + return false; if (!shouldSpeculateInstrs(Latch->begin(), Jmp)) - return; + return false; DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " << LastExit->getName() << "\n"); @@ -253,10 +254,20 @@ void LoopRotate::simplifyLoopLatch(Loop *L) { if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) DT->eraseNode(Latch); Latch->eraseFromParent(); + return true; } /// Rotate loop LP. Return true if the loop is rotated. -bool LoopRotate::rotateLoop(Loop *L) { +/// +/// \param SimplifiedLatch is true if the latch was just folded into the final +/// loop exit. In this case we may want to rotate even though the new latch is +/// now an exiting branch. This rotation would have happened had the latch not +/// been simplified. 
However, if SimplifiedLatch is false, then we avoid +/// rotating loops in which the latch exits to avoid excessive or endless +/// rotation. LoopRotate should be repeatable and converge to a canonical +/// form. This property is satisfied because simplifying the loop latch can only +/// happen once across multiple invocations of the LoopRotate pass. +bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // If the loop has only one block then there is not much to rotate. if (L->getBlocks().size() == 1) return false; @@ -276,7 +287,12 @@ bool LoopRotate::rotateLoop(Loop *L) { // If the loop latch already contains a branch that leaves the loop then the // loop is already rotated. - if (OrigLatch == 0 || L->isLoopExiting(OrigLatch)) + if (OrigLatch == 0) + return false; + + // Rotate if either the loop latch does *not* exit the loop, or if the loop + // latch was just simplified. + if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch) return false; // Check size of original header and reject loop if it is very big or we can't @@ -505,4 +521,3 @@ bool LoopRotate::rotateLoop(Loop *L) { ++NumRotated; return true; } - diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 73e44d7..b107fef 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -774,6 +774,16 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) { } namespace { +class LSRUse; +} +// Check if it is legal to fold 2 base registers. +static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU, + const Formula &F); +// Get the cost of the scaling factor used in F for LU. +static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, + const LSRUse &LU, const Formula &F); + +namespace { /// Cost - This class is used to measure and compare candidate formulae. class Cost { @@ -785,11 +795,12 @@ class Cost { unsigned NumBaseAdds; unsigned ImmCost; unsigned SetupCost; + unsigned ScaleCost; public: Cost() : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0), - SetupCost(0) {} + SetupCost(0), ScaleCost(0) {} bool operator<(const Cost &Other) const; @@ -799,9 +810,9 @@ public: // Once any of the metrics loses, they must all remain losers. bool isValid() { return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds - | ImmCost | SetupCost) != ~0u) + | ImmCost | SetupCost | ScaleCost) != ~0u) || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds - & ImmCost & SetupCost) == ~0u); + & ImmCost & SetupCost & ScaleCost) == ~0u); } #endif @@ -810,12 +821,14 @@ public: return NumRegs == ~0u; } - void RateFormula(const Formula &F, + void RateFormula(const TargetTransformInfo &TTI, + const Formula &F, SmallPtrSet<const SCEV *, 16> &Regs, const DenseSet<const SCEV *> &VisitedRegs, const Loop *L, const SmallVectorImpl<int64_t> &Offsets, ScalarEvolution &SE, DominatorTree &DT, + const LSRUse &LU, SmallPtrSet<const SCEV *, 16> *LoserRegs = 0); void print(raw_ostream &OS) const; @@ -900,12 +913,14 @@ void Cost::RatePrimaryRegister(const SCEV *Reg, } } -void Cost::RateFormula(const Formula &F, +void Cost::RateFormula(const TargetTransformInfo &TTI, + const Formula &F, SmallPtrSet<const SCEV *, 16> &Regs, const DenseSet<const SCEV *> &VisitedRegs, const Loop *L, const SmallVectorImpl<int64_t> &Offsets, ScalarEvolution &SE, DominatorTree &DT, + const LSRUse &LU, SmallPtrSet<const SCEV *, 16> *LoserRegs) { // Tally up the registers. 
if (const SCEV *ScaledReg = F.ScaledReg) { @@ -932,7 +947,12 @@ void Cost::RateFormula(const Formula &F, // Determine how many (unfolded) adds we'll need inside the loop. size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0); if (NumBaseParts > 1) - NumBaseAdds += NumBaseParts - 1; + // Do not count the base and a possible second register if the target + // allows to fold 2 registers. + NumBaseAdds += NumBaseParts - (1 + isLegal2RegAMUse(TTI, LU, F)); + + // Accumulate non-free scaling amounts. + ScaleCost += getScalingFactorCost(TTI, LU, F); // Tally up the non-zero immediates. for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(), @@ -955,6 +975,7 @@ void Cost::Loose() { NumBaseAdds = ~0u; ImmCost = ~0u; SetupCost = ~0u; + ScaleCost = ~0u; } /// operator< - Choose the lower cost. @@ -967,6 +988,8 @@ bool Cost::operator<(const Cost &Other) const { return NumIVMuls < Other.NumIVMuls; if (NumBaseAdds != Other.NumBaseAdds) return NumBaseAdds < Other.NumBaseAdds; + if (ScaleCost != Other.ScaleCost) + return ScaleCost < Other.ScaleCost; if (ImmCost != Other.ImmCost) return ImmCost < Other.ImmCost; if (SetupCost != Other.SetupCost) @@ -983,6 +1006,8 @@ void Cost::print(raw_ostream &OS) const { if (NumBaseAdds != 0) OS << ", plus " << NumBaseAdds << " base add" << (NumBaseAdds == 1 ? "" : "s"); + if (ScaleCost != 0) + OS << ", plus " << ScaleCost << " scale cost"; if (ImmCost != 0) OS << ", plus " << ImmCost << " imm cost"; if (SetupCost != 0) @@ -1359,6 +1384,58 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, F.BaseOffset, F.HasBaseReg, F.Scale); } +static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU, + const Formula &F) { + // If F is used as an Addressing Mode, it may fold one Base plus one + // scaled register. If the scaled register is nil, do as if another + // element of the base regs is a 1-scaled register. + // This is possible if BaseRegs has at least 2 registers. + + // If this is not an address calculation, this is not an addressing mode + // use. + if (LU.Kind != LSRUse::Address) + return false; + + // F is already scaled. + if (F.Scale != 0) + return false; + + // We need to keep one register for the base and one to scale. + if (F.BaseRegs.size() < 2) + return false; + + return isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, + F.BaseGV, F.BaseOffset, F.HasBaseReg, 1); + } + +static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, + const LSRUse &LU, const Formula &F) { + if (!F.Scale) + return 0; + assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, F) && "Illegal formula in use."); + + switch (LU.Kind) { + case LSRUse::Address: { + int CurScaleCost = TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV, + F.BaseOffset, F.HasBaseReg, + F.Scale); + assert(CurScaleCost >= 0 && "Legal addressing mode has an illegal cost!"); + return CurScaleCost; + } + case LSRUse::ICmpZero: + // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg. + // Therefore, return 0 in case F.Scale == -1. 
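As a worked example of that case: an ICmpZero use models (BaseReg + Scale*ScaleReg) == 0, and for Scale == -1 this is just BaseReg == ScaleReg, so the scale folds into the compare; any other non-zero scale has to be materialized. The return that follows encodes this zero-or-one cost; a hypothetical helper spelling it out:

// icmp eq (BaseReg + (-1)*ScaleReg), 0   <=>   icmp eq BaseReg, ScaleReg
// Invoked only for non-zero scales; F.Scale == 0 already returned above.
static unsigned icmpZeroScaleCost(long long Scale) {
  return Scale != -1 ? 1 : 0;
}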
+ return F.Scale != -1; + + case LSRUse::Basic: + case LSRUse::Special: + return 0; + } + + llvm_unreachable("Invalid LSRUse Kind!"); +} + static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, Type *AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, @@ -3607,7 +3684,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { abs64(NewF.BaseOffset)) && (C->getValue()->getValue() + NewF.BaseOffset).countTrailingZeros() >= - CountTrailingZeros_64(NewF.BaseOffset)) + countTrailingZeros<uint64_t>(NewF.BaseOffset)) goto skip_formula; // Ok, looks good. @@ -3690,7 +3767,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { // the corresponding bad register from the Regs set. Cost CostF; Regs.clear(); - CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, + CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU, &LoserRegs); if (CostF.isLoser()) { // During initial formula generation, undesirable formulae are generated @@ -3726,7 +3803,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { Cost CostBest; Regs.clear(); - CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT); + CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE, + DT, LU); if (CostF < CostBest) std::swap(F, Best); DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs()); @@ -4079,7 +4157,8 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, // the current best, prune the search at that point. NewCost = CurCost; NewRegs = CurRegs; - NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT); + NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT, + LU); if (NewCost < SolutionCost) { Workspace.push_back(&F); if (Workspace.size() != Uses.size()) { diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index be0f0e8..c325925 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -626,8 +626,14 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, return false; Type *StructTy = cast<PointerType>(A->getType())->getElementType(); - uint64_t destSize = TD->getTypeAllocSize(StructTy); + if (!StructTy->isSized()) { + // The call may never return and hence the copy-instruction may never + // be executed, and therefore it's not safe to say "the destination + // has at least <cpyLen> bytes, as implied by the copy-instruction", + return false; + } + uint64_t destSize = TD->getTypeAllocSize(StructTy); if (destSize < srcSize) return false; } else { diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index be8d39e..d105f5e 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -78,7 +78,8 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, bool ModuleLevelChanges, SmallVectorImpl<ReturnInst*> &Returns, const char *NameSuffix, ClonedCodeInfo *CodeInfo, - ValueMapTypeRemapper *TypeMapper) { + ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) { assert(NameSuffix && "NameSuffix cannot be null!"); #ifndef NDEBUG @@ -147,7 +148,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, for (BasicBlock::iterator II = BB->begin(); II != BB->end(); ++II) RemapInstruction(II, VMap, ModuleLevelChanges ? 
RF_None : RF_NoModuleLevelChanges, - TypeMapper); + TypeMapper, Materializer); } /// CloneFunction - Return a copy of the specified function, but without diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp index 37819cc..6d5f16c 100644 --- a/lib/Transforms/Utils/LoopSimplify.cpp +++ b/lib/Transforms/Utils/LoopSimplify.cpp @@ -59,6 +59,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; STATISTIC(NumInserted, "Number of pre-header or exit blocks inserted"); @@ -100,16 +101,16 @@ namespace { private: bool ProcessLoop(Loop *L, LPPassManager &LPM); BasicBlock *RewriteLoopExitBlock(Loop *L, BasicBlock *Exit); - BasicBlock *InsertPreheaderForLoop(Loop *L); Loop *SeparateNestedLoop(Loop *L, LPPassManager &LPM, BasicBlock *Preheader); BasicBlock *InsertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader); - void PlaceSplitBlockCarefully(BasicBlock *NewBB, - SmallVectorImpl<BasicBlock*> &SplitPreds, - Loop *L); }; } +static void PlaceSplitBlockCarefully(BasicBlock *NewBB, + SmallVectorImpl<BasicBlock*> &SplitPreds, + Loop *L); + char LoopSimplify::ID = 0; INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify", "Canonicalize natural loops", true, false) @@ -208,7 +209,7 @@ ReprocessLoop: // Does the loop already have a preheader? If so, don't insert one. BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) { - Preheader = InsertPreheaderForLoop(L); + Preheader = InsertPreheaderForLoop(L, this); if (Preheader) { ++NumInserted; Changed = true; @@ -367,7 +368,7 @@ ReprocessLoop: /// preheader, this method is called to insert one. This method has two phases: /// preheader insertion and analysis updating. /// -BasicBlock *LoopSimplify::InsertPreheaderForLoop(Loop *L) { +BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) { BasicBlock *Header = L->getHeader(); // Compute the set of predecessors of the loop that are not in the loop. @@ -390,11 +391,11 @@ BasicBlock *LoopSimplify::InsertPreheaderForLoop(Loop *L) { BasicBlock *PreheaderBB; if (!Header->isLandingPad()) { PreheaderBB = SplitBlockPredecessors(Header, OutsideBlocks, ".preheader", - this); + PP); } else { SmallVector<BasicBlock*, 2> NewBBs; SplitLandingPadPredecessors(Header, OutsideBlocks, ".preheader", - ".split-lp", this, NewBBs); + ".split-lp", PP, NewBBs); PreheaderBB = NewBBs[0]; } @@ -491,9 +492,9 @@ static PHINode *FindPHIToPartitionLoops(Loop *L, DominatorTree *DT, // PlaceSplitBlockCarefully - If the block isn't already, move the new block to // right after some 'outside block' block. This prevents the preheader from // being placed inside the loop body, e.g. when the loop hasn't been rotated. -void LoopSimplify::PlaceSplitBlockCarefully(BasicBlock *NewBB, - SmallVectorImpl<BasicBlock*> &SplitPreds, - Loop *L) { +void PlaceSplitBlockCarefully(BasicBlock *NewBB, + SmallVectorImpl<BasicBlock*> &SplitPreds, + Loop *L) { // Check to see if NewBB is already well placed. 
Function::iterator BBI = NewBB; --BBI; for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) { diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 052ad85..6d12f7a 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -281,7 +281,7 @@ static Value *GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue, return BI->getCondition(); } -/// ComputeSpeculuationCost - Compute an abstract "cost" of speculating the +/// ComputeSpeculationCost - Compute an abstract "cost" of speculating the /// given instruction, which is assumed to be safe to speculate. 1 means /// cheap, 2 means less cheap, and UINT_MAX means prohibitively expensive. static unsigned ComputeSpeculationCost(const User *I) { @@ -533,9 +533,7 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) { } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) if (BI->isConditional() && BI->getCondition()->hasOneUse()) if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition())) - if ((ICI->getPredicate() == ICmpInst::ICMP_EQ || - ICI->getPredicate() == ICmpInst::ICMP_NE) && - GetConstantInt(ICI->getOperand(1), TD)) + if (ICI->isEquality() && GetConstantInt(ICI->getOperand(1), TD)) CV = ICI->getOperand(0); // Unwrap any lossless ptrtoint cast. @@ -1083,9 +1081,9 @@ static bool HoistThenElseCodeToIf(BranchInst *BI) { (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2))) return false; - // If we get here, we can hoist at least one instruction. BasicBlock *BIParent = BI->getParent(); + bool Changed = false; do { // If we are hoisting the terminator instruction, don't move one (making a // broken BB), instead clone it, and remove BI. @@ -1100,6 +1098,7 @@ static bool HoistThenElseCodeToIf(BranchInst *BI) { I2->replaceAllUsesWith(I1); I1->intersectOptionalDataWith(I2); I2->eraseFromParent(); + Changed = true; I1 = BB1_Itr++; I2 = BB2_Itr++; @@ -1119,7 +1118,23 @@ static bool HoistThenElseCodeToIf(BranchInst *BI) { HoistTerminator: // It may not be possible to hoist an invoke. if (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2)) - return true; + return Changed; + + for (succ_iterator SI = succ_begin(BB1), E = succ_end(BB1); SI != E; ++SI) { + PHINode *PN; + for (BasicBlock::iterator BBI = SI->begin(); + (PN = dyn_cast<PHINode>(BBI)); ++BBI) { + Value *BB1V = PN->getIncomingValueForBlock(BB1); + Value *BB2V = PN->getIncomingValueForBlock(BB2); + if (BB1V == BB2V) + continue; + + if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V)) + return Changed; + if (isa<ConstantExpr>(BB2V) && !isSafeToSpeculativelyExecute(BB2V)) + return Changed; + } + } // Okay, it is safe to hoist the terminator. Instruction *NT = I1->clone(); @@ -1362,8 +1377,8 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { /// /// \return The pointer to the value of the previous store if the store can be /// hoisted into the predecessor block. 0 otherwise. -Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB, - BasicBlock *StoreBB, BasicBlock *EndBB) { +static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB, + BasicBlock *StoreBB, BasicBlock *EndBB) { StoreInst *StoreToHoist = dyn_cast<StoreInst>(I); if (!StoreToHoist) return 0; @@ -1522,18 +1537,23 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) { Value *OrigV = PN->getIncomingValueForBlock(BB); Value *ThenV = PN->getIncomingValueForBlock(ThenBB); + // FIXME: Try to remove some of the duplication with HoistThenElseCodeToIf. 
// Skip PHIs which are trivial. if (ThenV == OrigV) continue; HaveRewritablePHIs = true; - ConstantExpr *CE = dyn_cast<ConstantExpr>(ThenV); - if (!CE) + ConstantExpr *OrigCE = dyn_cast<ConstantExpr>(OrigV); + ConstantExpr *ThenCE = dyn_cast<ConstantExpr>(ThenV); + if (!OrigCE && !ThenCE) continue; // Known safe and cheap. - if (!isSafeToSpeculativelyExecute(CE)) + if ((ThenCE && !isSafeToSpeculativelyExecute(ThenCE)) || + (OrigCE && !isSafeToSpeculativelyExecute(OrigCE))) return false; - if (ComputeSpeculationCost(CE) > PHINodeFoldingThreshold) + unsigned OrigCost = OrigCE ? ComputeSpeculationCost(OrigCE) : 0; + unsigned ThenCost = ThenCE ? ComputeSpeculationCost(ThenCE) : 0; + if (OrigCost + ThenCost > 2 * PHINodeFoldingThreshold) return false; // Account for the cost of an unfolded ConstantExpr which could end up @@ -3643,7 +3663,7 @@ bool SwitchLookupTable::WouldFitInRegister(const DataLayout *TD, } /// ShouldBuildLookupTable - Determine whether a lookup table should be built -/// for this switch, based on the number of caes, size of the table and the +/// for this switch, based on the number of cases, size of the table and the /// types of the results. static bool ShouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize, diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp index b5941bd..457fc80 100644 --- a/lib/Transforms/Utils/ValueMapper.cpp +++ b/lib/Transforms/Utils/ValueMapper.cpp @@ -22,14 +22,22 @@ using namespace llvm; // Out of line method to get vtable etc for class. void ValueMapTypeRemapper::anchor() {} +void ValueMaterializer::anchor() {} Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, - ValueMapTypeRemapper *TypeMapper) { + ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) { ValueToValueMapTy::iterator I = VM.find(V); // If the value already exists in the map, use it. if (I != VM.end() && I->second) return I->second; + // If we have a materializer and it can materialize a value, use that. + if (Materializer) { + if (Value *NewV = Materializer->materializeValueFor(const_cast<Value*>(V))) + return VM[V] = NewV; + } + // Global values do not need to be seeded into the VM if they // are using the identity mapping. if (isa<GlobalValue>(V) || isa<MDString>(V)) @@ -57,14 +65,14 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, return VM[V] = const_cast<Value*>(V); // Create a dummy node in case we have a metadata cycle. - MDNode *Dummy = MDNode::getTemporary(V->getContext(), ArrayRef<Value*>()); + MDNode *Dummy = MDNode::getTemporary(V->getContext(), None); VM[V] = Dummy; // Check all operands to see if any need to be remapped. for (unsigned i = 0, e = MD->getNumOperands(); i != e; ++i) { Value *OP = MD->getOperand(i); if (OP == 0) continue; - Value *Mapped_OP = MapValue(OP, VM, Flags, TypeMapper); + Value *Mapped_OP = MapValue(OP, VM, Flags, TypeMapper, Materializer); // Use identity map if Mapped_Op is null and we can ignore missing // entries. if (Mapped_OP == OP || @@ -79,7 +87,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, if (Op == 0) Elts.push_back(0); else { - Value *Mapped_Op = MapValue(Op, VM, Flags, TypeMapper); + Value *Mapped_Op = MapValue(Op, VM, Flags, TypeMapper, Materializer); // Use identity map if Mapped_Op is null and we can ignore missing // entries. 
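Stepping back from this hunk: the ValueMaterializer hook now threaded through every MapValue call lets a client synthesize a mapping the first time a value is referenced, instead of pre-seeding the whole value map. A hypothetical subclass as a usage sketch (the class name and the declaration-only policy are illustrative; only the materializeValueFor virtual from this patch is assumed):

#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

using namespace llvm;

// Illustrative sketch: when mapping reaches a function from a source module,
// materialize a matching declaration in the destination module so the
// reference can be rewritten before any bodies are copied.
class LazyDeclMaterializer : public ValueMaterializer {
  Module &Dest;
public:
  explicit LazyDeclMaterializer(Module &M) : Dest(M) {}
  virtual Value *materializeValueFor(Value *V) {
    if (Function *F = dyn_cast<Function>(V))
      return Function::Create(F->getFunctionType(), F->getLinkage(),
                              F->getName(), &Dest);
    return 0; // anything else: fall back to the normal mapping rules
  }
};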
if (Mapped_Op == 0 && (Flags & RF_IgnoreMissingEntries)) @@ -109,9 +117,9 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, if (BlockAddress *BA = dyn_cast<BlockAddress>(C)) { Function *F = - cast<Function>(MapValue(BA->getFunction(), VM, Flags, TypeMapper)); + cast<Function>(MapValue(BA->getFunction(), VM, Flags, TypeMapper, Materializer)); BasicBlock *BB = cast_or_null<BasicBlock>(MapValue(BA->getBasicBlock(), VM, - Flags, TypeMapper)); + Flags, TypeMapper, Materializer)); return VM[V] = BlockAddress::get(F, BB ? BB : BA->getBasicBlock()); } @@ -121,7 +129,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, Value *Mapped = 0; for (; OpNo != NumOperands; ++OpNo) { Value *Op = C->getOperand(OpNo); - Mapped = MapValue(Op, VM, Flags, TypeMapper); + Mapped = MapValue(Op, VM, Flags, TypeMapper, Materializer); if (Mapped != C) break; } @@ -149,7 +157,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, // Map the rest of the operands that aren't processed yet. for (++OpNo; OpNo != NumOperands; ++OpNo) Ops.push_back(MapValue(cast<Constant>(C->getOperand(OpNo)), VM, - Flags, TypeMapper)); + Flags, TypeMapper, Materializer)); } if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) @@ -173,10 +181,11 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, /// current values into those specified by VMap. /// void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap, - RemapFlags Flags, ValueMapTypeRemapper *TypeMapper){ + RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer){ // Remap operands. for (User::op_iterator op = I->op_begin(), E = I->op_end(); op != E; ++op) { - Value *V = MapValue(*op, VMap, Flags, TypeMapper); + Value *V = MapValue(*op, VMap, Flags, TypeMapper, Materializer); // If we aren't ignoring missing entries, assert that something happened. if (V != 0) *op = V; @@ -204,7 +213,7 @@ void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap, for (SmallVectorImpl<std::pair<unsigned, MDNode *> >::iterator MI = MDs.begin(), ME = MDs.end(); MI != ME; ++MI) { MDNode *Old = MI->second; - MDNode *New = MapValue(Old, VMap, Flags, TypeMapper); + MDNode *New = MapValue(Old, VMap, Flags, TypeMapper, Materializer); if (New != Old) I->setMetadata(MI->first, New); } diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 9a832f7..3693f4a 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8,9 +8,9 @@ //===----------------------------------------------------------------------===// // // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops -// and generates target-independent LLVM-IR. Legalization of the IR is done -// in the codegen. However, the vectorizer uses (will use) the codegen -// interfaces to generate IR that is likely to result in an optimal binary. +// and generates target-independent LLVM-IR. +// The vectorizer uses the TargetTransformInfo analysis to estimate the costs +// of instructions in order to estimate the profitability of vectorization. // // The loop vectorizer combines consecutive loop iterations into a single // 'wide' iteration. 
After this transformation the index is incremented @@ -80,6 +80,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/PatternMatch.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/ValueHandle.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -118,11 +119,11 @@ static const unsigned TinyTripCountUnrollThreshold = 128; /// than this number of comparisons. static const unsigned RuntimeMemoryCheckThreshold = 8; -/// We use a metadata with this name to indicate that a scalar loop was -/// vectorized and that we don't need to re-vectorize it if we run into it -/// again. -static const char* -AlreadyVectorizedMDName = "llvm.vectorizer.already_vectorized"; +/// Maximum SIMD width. +static const unsigned MaxVectorWidth = 64; + +/// Maximum vectorization unroll count. +static const unsigned MaxUnrollFactor = 16; namespace { @@ -216,7 +217,7 @@ private: /// This function adds 0, 1, 2 ... to each vector element, starting at zero. /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...). /// The sequence starts at StartIndex. - Value *getConsecutiveVector(Value* Val, unsigned StartIdx, bool Negate); + Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate); /// When we go over instructions in the basic block we rely on previous /// values within the current basic block or on loop invariant values. @@ -312,10 +313,85 @@ private: PHINode *Induction; /// The induction variable of the old basic block. PHINode *OldInduction; + /// Holds the extended (to the widest induction type) start index. + Value *ExtendedIdx; /// Maps scalars to widened vectors. ValueMap WidenMap; }; +/// \brief Check if conditionally executed loads are hoistable. +/// +/// This class has two functions: isHoistableLoad and canHoistAllLoads. +/// isHoistableLoad should be called on all load instructions that are executed +/// conditionally. After all conditional loads are processed, the client should +/// call canHoistAllLoads to determine if all of the conditionally executed +/// loads have an unconditional memory access to the same address in the loop. +class LoadHoisting { + typedef SmallPtrSet<Value *, 8> MemorySet; + + Loop *TheLoop; + DominatorTree *DT; + MemorySet CondLoadAddrSet; + +public: + LoadHoisting(Loop *L, DominatorTree *D) : TheLoop(L), DT(D) {} + + /// \brief Check if the instruction is a load with an identifiable address. + bool isHoistableLoad(Instruction *L); + + /// \brief Check if all of the conditional loads are hoistable because there + /// exists an unconditional memory access to the same address in the loop. + bool canHoistAllLoads(); +}; + +bool LoadHoisting::isHoistableLoad(Instruction *L) { + LoadInst *LI = dyn_cast<LoadInst>(L); + if (!LI) + return false; + + CondLoadAddrSet.insert(LI->getPointerOperand()); + return true; +} + +static void addMemAccesses(BasicBlock *BB, SmallPtrSet<Value *, 8> &Set) { + for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) { + if (LoadInst *LI = dyn_cast<LoadInst>(BI)) // Try a load. + Set.insert(LI->getPointerOperand()); + else if (StoreInst *SI = dyn_cast<StoreInst>(BI)) // Try a store. + Set.insert(SI->getPointerOperand()); + } +} + +bool LoadHoisting::canHoistAllLoads() { + // No conditional loads.
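Put differently, a conditionally executed load is only considered hoistable when a block that runs on every iteration, i.e. one that dominates the latch, accesses the same address. A standalone sketch of the whole check with hypothetical mini types; canHoistAllLoads itself continues below, starting with the trivial case of no conditional loads:

#include <set>
#include <string>
#include <vector>

// Hypothetical mini model: an "address" is a name; a block knows its memory
// accesses and whether it executes unconditionally (dominates the latch).
struct MiniBlock {
  bool RunsEveryIteration;
  std::vector<std::string> Accesses;
};

static bool canHoistAll(const std::set<std::string> &CondLoadAddrs,
                        const std::vector<MiniBlock> &LoopBlocks) {
  std::set<std::string> Uncond;
  for (unsigned I = 0, E = LoopBlocks.size(); I != E; ++I)
    if (LoopBlocks[I].RunsEveryIteration)
      Uncond.insert(LoopBlocks[I].Accesses.begin(),
                    LoopBlocks[I].Accesses.end());

  // Every conditionally loaded address needs an unconditional twin.
  for (std::set<std::string>::const_iterator I = CondLoadAddrs.begin(),
       E = CondLoadAddrs.end(); I != E; ++I)
    if (!Uncond.count(*I))
      return false;
  return true;
}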
+ if (CondLoadAddrSet.empty()) + return true; + + MemorySet UncondMemAccesses; + std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector(); + BasicBlock *LoopLatch = TheLoop->getLoopLatch(); + + // Iterate over the unconditional blocks and collect memory access addresses. + for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) { + BasicBlock *BB = LoopBlocks[i]; + + // Ignore conditional blocks. + if (BB != LoopLatch && !DT->dominates(BB, LoopLatch)) + continue; + + addMemAccesses(BB, UncondMemAccesses); + } + + // And make sure there is a matching unconditional access for every + // conditional load. + for (MemorySet::iterator MI = CondLoadAddrSet.begin(), + ME = CondLoadAddrSet.end(); MI != ME; ++MI) + if (!UncondMemAccesses.count(*MI)) + return false; + + return true; +} + /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and /// to what vectorization factor. /// This class does not look at the profitability of vectorization, only the @@ -335,7 +411,8 @@ public: DominatorTree *DT, TargetTransformInfo* TTI, AliasAnalysis *AA, TargetLibraryInfo *TLI) : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI), - Induction(0) {} + Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false), + LoadSpeculation(L, DT) {} /// This enum represents the kinds of reductions that we support. enum ReductionKind { @@ -347,7 +424,8 @@ public: RK_IntegerXor, ///< Bitwise or logical XOR of numbers. RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()). RK_FloatAdd, ///< Sum of floats. - RK_FloatMult ///< Product of floats. + RK_FloatMult, ///< Product of floats. + RK_FloatMinMax ///< Min/max implemented in terms of select(cmp()). }; /// This enum represents the kinds of inductions that we support. @@ -365,7 +443,9 @@ public: MRK_UIntMin, MRK_UIntMax, MRK_SIntMin, - MRK_SIntMax + MRK_SIntMax, + MRK_FloatMin, + MRK_FloatMax }; /// This POD struct holds information about reduction variables. @@ -379,7 +459,7 @@ public: // The starting value of the reduction. // It does not have to be zero! - Value *StartValue; + TrackingVH<Value> StartValue; // The instruction who's value is used outside the loop. Instruction *LoopExitInstr; // The kind of the reduction. @@ -424,7 +504,7 @@ public: /// This flag indicates if we need to add the runtime check. bool Need; /// Holds the pointers that we need to check. - SmallVector<Value*, 2> Pointers; + SmallVector<TrackingVH<Value>, 2> Pointers; /// Holds the pointer value at the beginning of the loop. SmallVector<const SCEV*, 2> Starts; /// Holds the pointer value at the end of the loop. @@ -438,7 +518,7 @@ public: InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {} InductionInfo() : StartValue(0), IK(IK_NoInduction) {} /// Start value. - Value *StartValue; + TrackingVH<Value> StartValue; /// Induction kind. InductionKind IK; }; @@ -470,6 +550,9 @@ public: /// Returns the induction variables found in the loop. InductionList *getInductionVars() { return &Inductions; } + /// Returns the widest induction type. + Type *getWidestInductionType() { return WidestIndTy; } + /// Returns True if V is an induction variable in this loop. bool isInductionVariable(const Value *V); @@ -498,8 +581,7 @@ public: /// This function returns the identity element (or neutral element) for /// the operation K. 
- static Constant *getReductionIdentity(ReductionKind K, Type *Tp, - MinMaxReductionKind MinMaxK); + static Constant *getReductionIdentity(ReductionKind K, Type *Tp); private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -577,6 +659,8 @@ private: /// Notice that inductions don't need to start at zero and that induction /// variables can be pointers. InductionList Inductions; + /// Holds the widest induction type encountered. + Type *WidestIndTy; /// Allowed outside users. This holds the reduction /// vars which can be accessed from outside the loop. @@ -587,6 +671,11 @@ private: /// We need to check that all of the pointers in this list are disjoint /// at runtime. RuntimePointerCheck PtrRtCheck; + /// Can we assume the absence of NaNs. + bool HasFunNoNaNAttr; + + /// Utility to determine whether loads can be speculated. + LoadHoisting LoadSpeculation; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -679,6 +768,126 @@ private: const TargetLibraryInfo *TLI; }; +/// Utility class for getting and setting loop vectorizer hints in the form +/// of loop metadata. +struct LoopVectorizeHints { + /// Vectorization width. + unsigned Width; + /// Vectorization unroll factor. + unsigned Unroll; + + LoopVectorizeHints(const Loop *L) + : Width(VectorizationFactor) + , Unroll(VectorizationUnroll) + , LoopID(L->getLoopID()) { + getHints(L); + // The command line options override any loop metadata except for when + // width == 1 which is used to indicate the loop is already vectorized. + if (VectorizationFactor.getNumOccurrences() > 0 && Width != 1) + Width = VectorizationFactor; + if (VectorizationUnroll.getNumOccurrences() > 0) + Unroll = VectorizationUnroll; + } + + /// Return the loop vectorizer metadata prefix. + static StringRef Prefix() { return "llvm.vectorizer."; } + + MDNode *createHint(LLVMContext &Context, StringRef Name, unsigned V) { + SmallVector<Value*, 2> Vals; + Vals.push_back(MDString::get(Context, Name)); + Vals.push_back(ConstantInt::get(Type::getInt32Ty(Context), V)); + return MDNode::get(Context, Vals); + } + + /// Mark the loop L as already vectorized by setting the width to 1. + void setAlreadyVectorized(Loop *L) { + LLVMContext &Context = L->getHeader()->getContext(); + + Width = 1; + + // Create a new loop id with one more operand for the already_vectorized + // hint. If the loop already has a loop id then copy the existing operands. + SmallVector<Value*, 4> Vals(1); + if (LoopID) + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) + Vals.push_back(LoopID->getOperand(i)); + + Vals.push_back(createHint(Context, Twine(Prefix(), "width").str(), Width)); + + MDNode *NewLoopID = MDNode::get(Context, Vals); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + + L->setLoopID(NewLoopID); + if (LoopID) + LoopID->replaceAllUsesWith(NewLoopID); + + LoopID = NewLoopID; + } + +private: + MDNode *LoopID; + + /// Find hints specified in the loop metadata. + void getHints(const Loop *L) { + if (!LoopID) + return; + + // First operand should refer to the loop id itself. 
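For orientation, the metadata shape being walked here, sketched in 2013-era textual IR (an assumption: metadata operands still carry the metadata keyword in this release); the asserts that follow check precisely the operand-0 self-reference:

// A loop id carrying a single width hint:
//   !0 = metadata !{metadata !0, metadata !1}  ; operand 0 is the node itself
//   !1 = metadata !{metadata !"llvm.vectorizer.width", i32 8}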
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); + assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); + + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + const MDString *S = 0; + SmallVector<Value*, 4> Args; + + // The expected hint is either a MDString or a MDNode with the first + // operand a MDString. + if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) { + if (!MD || MD->getNumOperands() == 0) + continue; + S = dyn_cast<MDString>(MD->getOperand(0)); + for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) + Args.push_back(MD->getOperand(i)); + } else { + S = dyn_cast<MDString>(LoopID->getOperand(i)); + assert(Args.size() == 0 && "too many arguments for MDString"); + } + + if (!S) + continue; + + // Check if the hint starts with the vectorizer prefix. + StringRef Hint = S->getString(); + if (!Hint.startswith(Prefix())) + continue; + // Remove the prefix. + Hint = Hint.substr(Prefix().size(), StringRef::npos); + + if (Args.size() == 1) + getHint(Hint, Args[0]); + } + } + + // Check string hint with one operand. + void getHint(StringRef Hint, Value *Arg) { + const ConstantInt *C = dyn_cast<ConstantInt>(Arg); + if (!C) return; + unsigned Val = C->getZExtValue(); + + if (Hint == "width") { + assert(isPowerOf2_32(Val) && Val <= MaxVectorWidth && + "Invalid width metadata"); + Width = Val; + } else if (Hint == "unroll") { + assert(isPowerOf2_32(Val) && Val <= MaxUnrollFactor && + "Invalid unroll metadata"); + Unroll = Val; + } else + DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint); + } +}; + /// The LoopVectorize Pass. struct LoopVectorize : public LoopPass { /// Pass identification, replacement for typeid @@ -717,6 +926,13 @@ struct LoopVectorize : public LoopPass { DEBUG(dbgs() << "LV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); + LoopVectorizeHints Hints(L); + + if (Hints.Width == 1) { + DEBUG(dbgs() << "LV: Not vectorizing.\n"); + return false; + } + // Check if it is legal to vectorize the loop. LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA, TLI); if (!LVL.canVectorize()) { @@ -744,10 +960,10 @@ struct LoopVectorize : public LoopPass { // Select the optimal vectorization factor. LoopVectorizationCostModel::VectorizationFactor VF; - VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor); + VF = CM.selectVectorizationFactor(OptForSize, Hints.Width); // Select the unroll factor. - unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll, - VF.Width, VF.Cost); + unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width, + VF.Cost); if (VF.Width == 1) { DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); @@ -762,6 +978,9 @@ struct LoopVectorize : public LoopPass { InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF); LB.vectorize(&LVL); + // Mark the loop as already vectorized to avoid vectorizing again. 
+ Hints.setAlreadyVectorized(L); + DEBUG(verifyFunction(*L->getHeader()->getParent())); return true; } @@ -794,7 +1013,7 @@ LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, const SCEV *Sc = SE->getSCEV(Ptr); const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); assert(AR && "Invalid addrec expression"); - const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch()); + const SCEV *Ex = SE->getBackedgeTakenCount(Lp); const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); Pointers.push_back(Ptr); Starts.push_back(AR->getStart()); @@ -825,7 +1044,7 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { return Shuf; } -Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx, +Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx, bool Negate) { assert(Val->getType()->isVectorTy() && "Must be a vector"); assert(Val->getType()->getScalarType()->isIntegerTy() && @@ -838,8 +1057,8 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx, // Create a vector of consecutive numbers from zero to VF. for (int i = 0; i < VLen; ++i) { - int Idx = Negate ? (-i): i; - Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx)); + int64_t Idx = Negate ? (-i) : i; + Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate)); } // Add the consecutive indices to the vector value. @@ -1229,21 +1448,15 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { BasicBlock *ExitBlock = OrigLoop->getExitBlock(); assert(ExitBlock && "Must have an exit block"); - // Mark the old scalar loop with metadata that tells us not to vectorize this - // loop again if we run into it. - MDNode *MD = MDNode::get(OldBasicBlock->getContext(), ArrayRef<Value*>()); - OldBasicBlock->getTerminator()->setMetadata(AlreadyVectorizedMDName, MD); - // Some loops have a single integer induction variable, while other loops // don't. One example is c++ iterators that often have multiple pointer // induction variables. In the code below we also support a case where we // don't have a single induction variable. OldInduction = Legal->getInduction(); - Type *IdxTy = OldInduction ? OldInduction->getType() : - DL->getIntPtrType(SE->getContext()); + Type *IdxTy = Legal->getWidestInductionType(); // Find the loop boundaries. - const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch()); + const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop); assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); // Get the total trip count from the count by adding 1. @@ -1261,9 +1474,11 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // The loop index does not have to start at Zero. Find the original start // value from the induction PHI node. If we don't have an induction variable // then we know that it starts at zero. - Value *StartIdx = OldInduction ? - OldInduction->getIncomingValueForBlock(BypassBlock): - ConstantInt::get(IdxTy, 0); + Builder.SetInsertPoint(BypassBlock->getTerminator()); + Value *StartIdx = ExtendedIdx = OldInduction ? 
+ Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock), + IdxTy): + ConstantInt::get(IdxTy, 0); assert(BypassBlock && "Invalid loop structure"); LoopBypassBlocks.push_back(BypassBlock); @@ -1357,76 +1572,101 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { PHINode *ResumeIndex = 0; LoopVectorizationLegality::InductionList::iterator I, E; LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); + // Set builder to point to last bypass block. + BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator()); for (I = List->begin(), E = List->end(); I != E; ++I) { PHINode *OrigPhi = I->first; LoopVectorizationLegality::InductionInfo II = I->second; - PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val", + + Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType(); + PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val", MiddleBlock->getTerminator()); + // We might have extended the type of the induction variable but we need a + // truncated version for the scalar loop. + PHINode *TruncResumeVal = (OrigPhi == OldInduction) ? + PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val", + MiddleBlock->getTerminator()) : 0; + Value *EndValue = 0; switch (II.IK) { case LoopVectorizationLegality::IK_NoInduction: llvm_unreachable("Unknown induction"); case LoopVectorizationLegality::IK_IntInduction: { - // Handle the integer induction counter: + // Handle the integer induction counter. assert(OrigPhi->getType()->isIntegerTy() && "Invalid type"); - assert(OrigPhi == OldInduction && "Unknown integer PHI"); - // We know what the end value is. - EndValue = IdxEndRoundDown; - // We also know which PHI node holds it. - ResumeIndex = ResumeVal; + + // We have the canonical induction variable. + if (OrigPhi == OldInduction) { + // Create a truncated version of the resume value for the scalar loop, + // we might have promoted the type to a larger width. + EndValue = + BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType()); + // The new PHI merges the original incoming value, in case of a bypass, + // or the value at the end of the vectorized loop. + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); + TruncResumeVal->addIncoming(EndValue, VecBody); + + // We know what the end value is. + EndValue = IdxEndRoundDown; + // We also know which PHI node holds it. + ResumeIndex = ResumeVal; + break; + } + + // Not the canonical induction variable - add the vector loop count to the + // start value. + Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, + II.StartValue->getType(), + "cast.crd"); + EndValue = BypassBuilder.CreateAdd(CRD, II.StartValue , "ind.end"); break; } case LoopVectorizationLegality::IK_ReverseIntInduction: { // Convert the CountRoundDown variable to the PHI size. 
- unsigned CRDSize = CountRoundDown->getType()->getScalarSizeInBits(); - unsigned IISize = II.StartValue->getType()->getScalarSizeInBits(); - Value *CRD = CountRoundDown; - if (CRDSize > IISize) - CRD = CastInst::Create(Instruction::Trunc, CountRoundDown, - II.StartValue->getType(), "tr.crd", - LoopBypassBlocks.back()->getTerminator()); - else if (CRDSize < IISize) - CRD = CastInst::Create(Instruction::SExt, CountRoundDown, - II.StartValue->getType(), - "sext.crd", - LoopBypassBlocks.back()->getTerminator()); - // Handle reverse integer induction counter: - EndValue = - BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end", - LoopBypassBlocks.back()->getTerminator()); + Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, + II.StartValue->getType(), + "cast.crd"); + // Handle reverse integer induction counter. + EndValue = BypassBuilder.CreateSub(II.StartValue, CRD, "rev.ind.end"); break; } case LoopVectorizationLegality::IK_PtrInduction: { // For pointer induction variables, calculate the offset using // the end index. - EndValue = - GetElementPtrInst::Create(II.StartValue, CountRoundDown, "ptr.ind.end", - LoopBypassBlocks.back()->getTerminator()); + EndValue = BypassBuilder.CreateGEP(II.StartValue, CountRoundDown, + "ptr.ind.end"); break; } case LoopVectorizationLegality::IK_ReversePtrInduction: { // The value at the end of the loop for the reverse pointer is calculated // by creating a GEP with a negative index starting from the start value. Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0); - Value *NegIdx = BinaryOperator::CreateSub(Zero, CountRoundDown, - "rev.ind.end", - LoopBypassBlocks.back()->getTerminator()); - EndValue = GetElementPtrInst::Create(II.StartValue, NegIdx, - "rev.ptr.ind.end", - LoopBypassBlocks.back()->getTerminator()); + Value *NegIdx = BypassBuilder.CreateSub(Zero, CountRoundDown, + "rev.ind.end"); + EndValue = BypassBuilder.CreateGEP(II.StartValue, NegIdx, + "rev.ptr.ind.end"); break; } }// end of case // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) - ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) { + if (OrigPhi == OldInduction) + ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]); + else + ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); + } ResumeVal->addIncoming(EndValue, VecBody); // Fix the scalar body counter (PHI node). unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); - OrigPhi->setIncomingValue(BlockIdx, ResumeVal); + // The old inductions phi node in the scalar body needs the truncated value. + if (OrigPhi == OldInduction) + OrigPhi->setIncomingValue(BlockIdx, TruncResumeVal); + else + OrigPhi->setIncomingValue(BlockIdx, ResumeVal); } // If we are generating a new induction variable then we also need to @@ -1501,8 +1741,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { /// This function returns the identity element (or neutral element) for /// the operation K. 
/// This function returns the identity element (or neutral element) for /// the operation K. Constant* -LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp, - MinMaxReductionKind MinMaxK) { +LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) { switch (K) { case RK_IntegerXor: case RK_IntegerAdd: @@ -1521,24 +1760,6 @@ LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp, case RK_FloatAdd: // Adding zero to a number does not change it. return ConstantFP::get(Tp, 0.0L); - case RK_IntegerMinMax: - switch(MinMaxK) { - default: llvm_unreachable("Unknown min/max predicate"); - case MRK_UIntMin: - return ConstantInt::getAllOnesValue(Tp); - case MRK_UIntMax: - return ConstantInt::get(Tp, 0); - case MRK_SIntMin: { - unsigned BitWidth = Tp->getPrimitiveSizeInBits(); - return ConstantInt::get(Tp->getContext(), - APInt::getSignedMaxValue(BitWidth)); - } - case LoopVectorizationLegality::MRK_SIntMax: { - unsigned BitWidth = Tp->getPrimitiveSizeInBits(); - return ConstantInt::get(Tp->getContext(), - APInt::getSignedMinValue(BitWidth)); - } - } default: llvm_unreachable("Unknown reduction kind"); } @@ -1668,6 +1889,8 @@ getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) { return Instruction::FAdd; case LoopVectorizationLegality::RK_IntegerMinMax: return Instruction::ICmp; + case LoopVectorizationLegality::RK_FloatMinMax: + return Instruction::FCmp; default: llvm_unreachable("Unknown reduction operation"); } @@ -1692,8 +1915,21 @@ Value *createMinMaxOp(IRBuilder<> &Builder, break; case LoopVectorizationLegality::MRK_SIntMax: P = CmpInst::ICMP_SGT; + break; + case LoopVectorizationLegality::MRK_FloatMin: + P = CmpInst::FCMP_OLT; + break; + case LoopVectorizationLegality::MRK_FloatMax: + P = CmpInst::FCMP_OGT; + break; } - Value *Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp"); + + Value *Cmp; + if (RK == LoopVectorizationLegality::MRK_FloatMin || RK == LoopVectorizationLegality::MRK_FloatMax) + Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp"); + else + Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp"); + Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select"); return Select; } @@ -1761,16 +1997,24 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Find the reduction identity variable. Zero for addition, or, xor, // one for multiplication, -1 for And. - Constant *Iden = - LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind, - VecTy->getScalarType(), - RdxDesc.MinMaxKind); - Constant *Identity = ConstantVector::getSplat(VF, Iden); - - // This vector is the Identity vector where the first element is the - // incoming scalar reduction. - Value *VectorStart = Builder.CreateInsertElement(Identity, - RdxDesc.StartValue, Zero); + Value *Identity; + Value *VectorStart; + if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax || + RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) { + // MinMax reductions have the start value as their identity. + VectorStart = Identity = Builder.CreateVectorSplat(VF, RdxDesc.StartValue, + "minmax.ident"); + } else { + Constant *Iden = + LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind, + VecTy->getScalarType()); + Identity = ConstantVector::getSplat(VF, Iden); + + // This vector is the Identity vector where the first element is the + // incoming scalar reduction. + VectorStart = Builder.CreateInsertElement(Identity, + RdxDesc.StartValue, Zero); + } // Fix the vector-loop phi.
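The "start value as identity" trick used above rests on min(s, s) == s: a vector lane seeded with the start value can never perturb the final result, whereas add, mul, and, or and xor reductions need a true neutral element. A plain C++ walk-through of the idea, with made-up values and four "lanes" standing in for a VF of 4:

#include <algorithm>
#include <cstdio>

int main() {
  int a[8] = {7, 3, 9, 1, 4, 8, 2, 6};
  int start = 5;
  // Each lane is seeded with the start value, mirroring
  // Builder.CreateVectorSplat(VF, RdxDesc.StartValue, "minmax.ident").
  int lane[4] = {start, start, start, start};
  for (int i = 0; i < 8; i += 4)
    for (int l = 0; l < 4; ++l)
      lane[l] = std::min(lane[l], a[i + l]);
  // Horizontal reduction of the lanes.
  int rdx = std::min(std::min(lane[0], lane[1]),
                     std::min(lane[2], lane[3]));
  printf("min = %d\n", rdx); // prints 1, same as the scalar loop
  return 0;
}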
// We created the induction variable so we know that the @@ -1784,7 +2028,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch); VectorParts &Val = getVectorValue(LoopVal); for (unsigned part = 0; part < UF; ++part) { - // Make sure to add the reduction stat value only to the + // Make sure to add the reduction start value only to the // first unroll part. Value *StartVal = (part == 0) ? VectorStart : Identity; cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader); @@ -1814,7 +2058,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { Value *ReducedPartRdx = RdxParts[0]; unsigned Op = getReductionBinOp(RdxDesc.Kind); for (unsigned part = 1; part < UF; ++part) { - if (Op != Instruction::ICmp) + if (Op != Instruction::ICmp && Op != Instruction::FCmp) ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part], ReducedPartRdx, "bin.rdx"); @@ -1845,7 +2089,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { ConstantVector::get(ShuffleMask), "rdx.shuf"); - if (Op != Instruction::ICmp) + if (Op != Instruction::ICmp && Op != Instruction::FCmp) TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"); else @@ -1982,18 +2226,33 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // We know that all PHIs in non header blocks are converted into // selects, so we don't have to worry about the insertion order and we // can just use the builder. - // At this point we generate the predication tree. There may be // duplications since this is a simple recursive scan, but future // optimizations will clean it up. - VectorParts Cond = createEdgeMask(P->getIncomingBlock(0), - P->getParent()); - for (unsigned part = 0; part < UF; ++part) { - VectorParts &In0 = getVectorValue(P->getIncomingValue(0)); - VectorParts &In1 = getVectorValue(P->getIncomingValue(1)); - Entry[part] = Builder.CreateSelect(Cond[part], In0[part], In1[part], - "predphi"); + unsigned NumIncoming = P->getNumIncomingValues(); + + // Generate a sequence of selects of the form: + // SELECT(Mask3, In3, + // SELECT(Mask2, In2, + // ( ...))) + for (unsigned In = 0; In < NumIncoming; In++) { + VectorParts Cond = createEdgeMask(P->getIncomingBlock(In), + P->getParent()); + VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); + + for (unsigned part = 0; part < UF; ++part) { + // We might have single edge PHIs (blocks) - use an identity + // 'select' for the first PHI operand. + if (In == 0) + Entry[part] = Builder.CreateSelect(Cond[part], In0[part], + In0[part]); + else + // Select between the current value and the previous incoming edge + // based on the incoming mask. + Entry[part] = Builder.CreateSelect(Cond[part], In0[part], + Entry[part], "predphi"); + } } continue; } @@ -2010,10 +2269,25 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, case LoopVectorizationLegality::IK_NoInduction: llvm_unreachable("Unknown induction"); case LoopVectorizationLegality::IK_IntInduction: { - assert(P == OldInduction && "Unexpected PHI"); - Value *Broadcasted = getBroadcastInstrs(Induction); - // After broadcasting the induction variable we need to make the - // vector consecutive by adding 0, 1, 2 ... + assert(P->getType() == II.StartValue->getType() && "Types must match"); + Type *PhiTy = P->getType(); + Value *Broadcasted; + if (P == OldInduction) { + // Handle the canonical induction variable.
We might have had to + // extend the type. + Broadcasted = Builder.CreateTrunc(Induction, PhiTy); + } else { + // Handle other induction variables that are now based on the + // canonical one. + Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx, + "normalized.idx"); + NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy); + Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx, + "offset.idx"); + } + Broadcasted = getBroadcastInstrs(Broadcasted); + // After broadcasting the induction variable we need to make the vector + // consecutive by adding 0, 1, 2, etc. for (unsigned part = 0; part < UF; ++part) Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false); continue; @@ -2022,16 +2296,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, case LoopVectorizationLegality::IK_PtrInduction: case LoopVectorizationLegality::IK_ReversePtrInduction: // Handle reverse integer and pointer inductions. - Value *StartIdx = 0; - // If we have a single integer induction variable then use it. - // Otherwise, start counting at zero. - if (OldInduction) { - LoopVectorizationLegality::InductionInfo OldII = - Legal->getInductionVars()->lookup(OldInduction); - StartIdx = OldII.StartValue; - } else { - StartIdx = ConstantInt::get(Induction->getType(), 0); - } + Value *StartIdx = ExtendedIdx; // This is the normalized GEP that starts counting at zero. Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, "normalized.idx"); @@ -2049,7 +2314,8 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // After broadcasting the induction variable we need to make the // vector consecutive by adding ... -3, -2, -1, 0. for (unsigned part = 0; part < UF; ++part) - Entry[part] = getConsecutiveVector(Broadcasted, -VF * part, true); + Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part, + true); continue; } @@ -2273,23 +2539,24 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!isa<BranchInst>(BB->getTerminator())) return false; - // We must have at most two predecessors because we need to convert - // all PHIs to selects. - unsigned Preds = std::distance(pred_begin(BB), pred_end(BB)); - if (Preds > 2) - return false; - // We must be able to predicate all blocks that need to be predicated. if (blockNeedsPredication(BB) && !blockCanBePredicated(BB)) return false; } + // Check that we can actually speculate the hoistable loads. + if (!LoadSpeculation.canHoistAllLoads()) + return false; + // We can if-convert this loop. return true; } bool LoopVectorizationLegality::canVectorize() { - assert(TheLoop->getLoopPreheader() && "No preheader!!"); + // We must have a loop in canonical form. Loops with indirectbr in them cannot + // be canonicalized. + if (!TheLoop->getLoopPreheader()) + return false; // We can only vectorize innermost loops. if (TheLoop->getSubLoopsVector().size()) @@ -2317,7 +2584,7 @@ bool LoopVectorizationLegality::canVectorize() { TheLoop->getHeader()->getName() << "\n"); // ScalarEvolution needs to be able to find the exit count. 
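The change from -VF * part to -(int)VF * part in the reverse-induction path above sidesteps a classic unsigned-arithmetic trap: negating an unsigned value wraps instead of producing a negative number, and the wrapped product only happens to read back as the intended offset if it is later squeezed through a 32-bit signed parameter. A small demonstration of the hazard (hypothetical values):

#include <cstdio>

int main() {
  unsigned VF = 4, part = 2;
  unsigned long long wrong = -VF * part;  // -VF wraps to 4294967292 first
  long long right = -(int)VF * (int)part; // -8, as intended
  printf("%llu %lld\n", wrong, right);    // prints: 4294967288 -8
  return 0;
}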
- const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch); + const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); if (ExitCount == SE->getCouldNotCompute()) { DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); return false; } @@ -2356,16 +2623,50 @@ bool LoopVectorizationLegality::canVectorize() { return true; } +static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) { + if (Ty->isPointerTy()) + return DL.getIntPtrType(Ty->getContext()); + return Ty; +} + +static Type* getWiderType(DataLayout &DL, Type *Ty0, Type *Ty1) { + Ty0 = convertPointerToIntegerType(DL, Ty0); + Ty1 = convertPointerToIntegerType(DL, Ty1); + if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits()) + return Ty0; + return Ty1; +} + +/// \brief Return true if the instruction \p Inst has users outside the loop +/// and is not a recognized reduction variable. +static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, + SmallPtrSet<Value *, 4> &Reductions) { + // Reduction instructions are allowed to have exit users. All other + // instructions must not have external users. + if (!Reductions.count(Inst)) + // Check that all of the users of the instruction are inside the loop. + for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end(); + I != E; ++I) { + Instruction *U = cast<Instruction>(*I); + // This user may be a reduction exit value. + if (!TheLoop->contains(U)) { + DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); + return true; + } + } + return false; +} + bool LoopVectorizationLegality::canVectorizeInstrs() { BasicBlock *PreHeader = TheLoop->getLoopPreheader(); BasicBlock *Header = TheLoop->getHeader(); - // If we marked the scalar loop as "already vectorized" then no need - // to vectorize it again. - if (Header->getTerminator()->getMetadata(AlreadyVectorizedMDName)) { - DEBUG(dbgs() << "LV: This loop was vectorized before\n"); - return false; - } + // Look for the attribute signaling the absence of NaNs. + Function &F = *Header->getParent(); + if (F.hasFnAttribute("no-nans-fp-math")) + HasFunNoNaNAttr = F.getAttributes().getAttribute( + AttributeSet::FunctionIndex, + "no-nans-fp-math").getValueAsString() == "true"; // For each block in the loop. for (Loop::block_iterator bb = TheLoop->block_begin(), @@ -2376,16 +2677,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { ++it) { if (PHINode *Phi = dyn_cast<PHINode>(it)) { - // This should not happen because the loop should be normalized. - if (Phi->getNumIncomingValues() != 2) { - DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); - return false; - } - + Type *PhiTy = Phi->getType(); // Check that this PHI type is allowed. - if (!Phi->getType()->isIntegerTy() && - !Phi->getType()->isFloatingPointTy() && - !Phi->getType()->isPointerTy()) { + if (!PhiTy->isIntegerTy() && + !PhiTy->isFloatingPointTy() && + !PhiTy->isPointerTy()) { DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); return false; } @@ -2393,8 +2689,19 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // If this PHINode is not in the header block, then we know that we // can convert it to select during if-conversion. No need to check if // the PHIs in this block are induction or reduction variables. - if (*bb != Header) - continue; + if (*bb != Header) { + // Check that this instruction has no outside users or is an + // identified reduction value with an outside user. + if (!hasOutsideLoopUser(TheLoop, it, AllowedExit)) + continue; + return false; + } +
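hasOutsideLoopUser centralizes the rule that only recognized reduction values may be live-out of a vectorizable loop. In source terms it separates shapes like the following (hypothetical function, names purely illustrative):

// 'sum' is a recognizable add reduction, so its final value may escape
// the loop; 'tmp' is not a reduction, so the use after the loop makes
// the loop non-vectorizable.
int liveOut(const int *a, int n, int &last) {
  int sum = 0, tmp = 0;
  for (int i = 0; i < n; ++i) {
    tmp = a[i] * 2;
    sum += tmp;
  }
  last = tmp; // outside user of a non-reduction value
  return sum; // outside user of a reduction: allowed
}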
+ // Header PHIs must have exactly two incoming values; only if-converted + // (non-header) PHIs, handled above, may have more. + if (Phi->getNumIncomingValues() != 2) { + DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); + return false; + } // This is the value coming from the preheader. Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); @@ -2402,13 +2709,19 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { InductionKind IK = isInductionVariable(Phi); if (IK_NoInduction != IK) { + // Get the widest type. + if (!WidestIndTy) + WidestIndTy = convertPointerToIntegerType(*DL, PhiTy); + else + WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy); + // Int inductions are special because we only allow one IV. if (IK == IK_IntInduction) { - if (Induction) { - DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n"); - return false; - } - Induction = Phi; + // Use the phi node with the widest type as induction. Use the last + // one if there are multiple (no good reason for doing this other + // than it is expedient). + if (!Induction || PhiTy == WidestIndTy) + Induction = Phi; } DEBUG(dbgs() << "LV: Found an induction variable.\n"); @@ -2448,6 +2761,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n"); continue; } + if (AddReductionVar(Phi, RK_FloatMinMax)) { + DEBUG(dbgs() << "LV: Found a float MINMAX reduction PHI."<< *Phi <<"\n"); + continue; + } DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); return false; @@ -2477,24 +2794,17 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. - if (!AllowedExit.count(it)) - //Check that all of the users of the loop are inside the BB. - for (Value::use_iterator I = it->use_begin(), E = it->use_end(); - I != E; ++I) { - Instruction *U = cast<Instruction>(*I); - // This user may be a reduction exit value. - if (!TheLoop->contains(U)) { - DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); - return false; - } - } + if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) + return false; + } // next instr. } if (!Induction) { DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); - assert(getInductionVars()->size() && "No induction variables"); + if (Inductions.empty()) + return false; } return true; @@ -2523,9 +2833,7 @@ void LoopVectorizationLegality::collectLoopUniforms() { Uniforms.insert(I); // Insert all operands.
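With the no-nans-fp-math attribute read earlier, loops of the following shape can now be recognized as RK_FloatMinMax reductions; without the attribute the fcmp/select pair is rejected, because reassociating min/max in the presence of NaNs can change the result. A hypothetical source shape (clang's fast-math style flags are what typically set the attribute):

// Candidate float-min reduction: an fcmp olt feeding a select, matched
// as MRK_FloatMin once NaNs are assumed absent.
float loopMin(const float *a, int n) {
  float m = a[0];
  for (int i = 1; i < n; ++i)
    m = a[i] < m ? a[i] : m;
  return m;
}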
- for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) { - Worklist.push_back(I->getOperand(i)); - } + Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); } } @@ -2776,8 +3084,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() { Inst, WriteObjects, MaxByteWidth)) { - DEBUG(dbgs() << "LV: Found a possible write-write reorder:" - << *UI <<"\n"); + DEBUG(dbgs() << "LV: Found a possible write-write reorder:" << **UI + << "\n"); return false; } @@ -2820,8 +3128,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() { Inst, WriteObjects, MaxByteWidth)) { - DEBUG(dbgs() << "LV: Found a possible read-write reorder:" - << *UI <<"\n"); + DEBUG(dbgs() << "LV: Found a possible read-write reorder:" << **UI + << "\n"); return false; } } @@ -2841,6 +3149,26 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return true; } +static bool hasMultipleUsesOf(Instruction *I, + SmallPtrSet<Instruction *, 8> &Insts) { + unsigned NumUses = 0; + for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) { + if (Insts.count(dyn_cast<Instruction>(*Use))) + ++NumUses; + if (NumUses > 1) + return true; + } + + return false; +} + +static bool areAllUsesIn(Instruction *I, SmallPtrSet<Instruction *, 8> &Set) { + for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) + if (!Set.count(dyn_cast<Instruction>(*Use))) + return false; + return true; +} + bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, ReductionKind Kind) { if (Phi->getNumIncomingValues() != 2) @@ -2859,129 +3187,160 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // This includes users of the reduction, variables (which form a cycle // which ends in the phi node). Instruction *ExitInstruction = 0; - // Indicates that we found a binary operation in our scan. - bool FoundBinOp = false; + // Indicates that we found a reduction operation in our scan. + bool FoundReduxOp = false; - // Iter is our iterator. We start with the PHI node and scan for all of the - // users of this instruction. All users must be instructions that can be - // used as reduction variables (such as ADD). We may have a single - // out-of-block user. The cycle must end with the original PHI. - Instruction *Iter = Phi; + // We start with the PHI node and scan for all of the users of this + // instruction. All users must be instructions that can be used as reduction + // variables (such as ADD). We must have a single out-of-block user. The cycle + // must include the original PHI. + bool FoundStartPHI = false; // To recognize min/max patterns formed by a icmp select sequence, we store // the number of instruction we saw from the recognized min/max pattern, - // such that we don't stop when we see the phi has two uses (one by the select - // and one by the icmp) and to make sure we only see exactly the two - // instructions. - unsigned NumICmpSelectPatternInst = 0; + // to make sure we only see exactly the two instructions. + unsigned NumCmpSelectPatternInst = 0; ReductionInstDesc ReduxDesc(false, 0); - // Avoid cycles in the chain. SmallPtrSet<Instruction *, 8> VisitedInsts; - while (VisitedInsts.insert(Iter)) { - // If the instruction has no users then this is a broken - // chain and can't be a reduction variable. - if (Iter->use_empty()) + SmallVector<Instruction *, 8> Worklist; + Worklist.push_back(Phi); + VisitedInsts.insert(Phi); + + // A value in the reduction can be used: + // - By the reduction: + // - Reduction operation: + // - One use of reduction value (safe). 
+ // - Multiple use of reduction value (not safe). + // - PHI: + // - All uses of the PHI must be the reduction (safe). + // - Otherwise, not safe. + // - By one instruction outside of the loop (safe). + // - By further instructions outside of the loop (not safe). + // - By an instruction that is not part of the reduction (not safe). + // This is either: + // * An instruction type other than PHI or the reduction operation. + // * A PHI in the header other than the initial PHI. + while (!Worklist.empty()) { + Instruction *Cur = Worklist.back(); + Worklist.pop_back(); + + // No users. + // If the instruction has no users then this is a broken chain and can't be + // a reduction variable. + if (Cur->use_empty()) return false; - // Did we find a user inside this loop already ? - bool FoundInBlockUser = false; - // Did we reach the initial PHI node already ? - bool FoundStartPHI = false; + bool IsAPhi = isa<PHINode>(Cur); - // Is this a bin op ? - FoundBinOp |= !isa<PHINode>(Iter); + // A header PHI other than the original PHI cannot be part of the cycle. + if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent()) + return false; - // For each of the *users* of iter. - for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end(); - it != e; ++it) { - Instruction *U = cast<Instruction>(*it); - // We already know that the PHI is a user. - if (U == Phi) { - FoundStartPHI = true; - continue; - } + // Reductions of instructions such as Div and Sub are only possible if the + // LHS is the reduction variable. + if (!Cur->isCommutative() && !IsAPhi && !isa<SelectInst>(Cur) && + !isa<ICmpInst>(Cur) && !isa<FCmpInst>(Cur) && + !VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0)))) + return false; + + // Any reduction instruction must be of one of the allowed kinds. + ReduxDesc = isReductionInstr(Cur, Kind, ReduxDesc); + if (!ReduxDesc.IsReduction) + return false; + + // A reduction operation must only have one use of the reduction value. + if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax && + hasMultipleUsesOf(Cur, VisitedInsts)) + return false; + + // All inputs to a PHI node must be reduction values. + if (IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts)) + return false; + + if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(Cur) || + isa<SelectInst>(Cur))) + ++NumCmpSelectPatternInst; + if (Kind == RK_FloatMinMax && (isa<FCmpInst>(Cur) || + isa<SelectInst>(Cur))) + ++NumCmpSelectPatternInst; + + // Check whether we found a reduction operator. + FoundReduxOp |= !IsAPhi; + + // Process users of current instruction. Push non-PHI nodes after PHI nodes + // onto the stack. This way we are going to have seen all inputs to PHI + // nodes once we get to them. + SmallVector<Instruction *, 8> NonPHIs; + SmallVector<Instruction *, 8> PHIs; + for (Value::use_iterator UI = Cur->use_begin(), E = Cur->use_end(); UI != E; + ++UI) { + Instruction *Usr = cast<Instruction>(*UI); // Check if we found the exit user. - BasicBlock *Parent = U->getParent(); + BasicBlock *Parent = Usr->getParent(); if (!TheLoop->contains(Parent)) { // Exit if you find multiple outside users. if (ExitInstruction != 0) return false; - ExitInstruction = Iter; - } - - // We allow in-loop PHINodes which are not the original reduction PHI - // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE - // structure) then don't skip this PHI.
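The switch from the old single-cursor scan to an explicit worklist lets the reduction cycle pass through intermediate, non-header PHIs, which is exactly what if/else bodies produce before if-conversion. A hypothetical source shape the new scan accepts:

// 'sum' flows through two different operations and a merge PHI before
// reaching the header PHI again; the worklist visits the merge PHI
// after both of its inputs. The subtraction is acceptable because the
// reduction value is its left-hand operand.
int signedSum(const int *a, int n) {
  int sum = 0;
  for (int i = 0; i < n; ++i) {
    if (a[i] > 0)
      sum = sum + a[i];
    else
      sum = sum - a[i];
  }
  return sum;
}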
- if (isa<PHINode>(Iter) && isa<PHINode>(U) && - U->getParent() != TheLoop->getHeader() && - TheLoop->contains(U) && - Iter->hasNUsesOrMore(2)) + ExitInstruction = Cur; continue; + } - // We can't have multiple inside users except for a combination of - // icmp/select both using the phi. - if (FoundInBlockUser && !NumICmpSelectPatternInst) - return false; - FoundInBlockUser = true; - - // Any reduction instr must be of one of the allowed kinds. - ReduxDesc = isReductionInstr(U, Kind, ReduxDesc); - if (!ReduxDesc.IsReduction) - return false; + // Process instructions only once (termination). + if (VisitedInsts.insert(Usr)) { + if (isa<PHINode>(Usr)) + PHIs.push_back(Usr); + else + NonPHIs.push_back(Usr); + } + // Remember that we completed the cycle. + if (Usr == Phi) + FoundStartPHI = true; + } + Worklist.append(PHIs.begin(), PHIs.end()); + Worklist.append(NonPHIs.begin(), NonPHIs.end()); + } - if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(U) || - isa<SelectInst>(U))) - ++NumICmpSelectPatternInst; + // This means we have seen one but not the other instruction of the + // pattern or more than just a select and cmp. + if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) && + NumCmpSelectPatternInst != 2) + return false; - // Reductions of instructions such as Div, and Sub is only - // possible if the LHS is the reduction variable. - if (!U->isCommutative() && !isa<PHINode>(U) && !isa<SelectInst>(U) && - !isa<ICmpInst>(U) && U->getOperand(0) != Iter) - return false; + if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) + return false; - Iter = ReduxDesc.PatternLastInst; - } + // We found a reduction var if we have reached the original phi node and we + // only have a single instruction with out-of-loop users. - // This means we have seen one but not the other instruction of the - // pattern or more than just a select and cmp. - if (Kind == RK_IntegerMinMax && NumICmpSelectPatternInst != 2) - return false; + // This instruction is allowed to have out-of-loop users. + AllowedExit.insert(ExitInstruction); - // We found a reduction var if we have reached the original - // phi node and we only have a single instruction with out-of-loop - // users. - if (FoundStartPHI) { - // This instruction is allowed to have out-of-loop users. - AllowedExit.insert(ExitInstruction); - - // Save the description of this reduction variable. - ReductionDescriptor RD(RdxStart, ExitInstruction, Kind, - ReduxDesc.MinMaxKind); - Reductions[Phi] = RD; - // We've ended the cycle. This is a reduction variable if we have an - // outside user and it has a binary op. - return FoundBinOp && ExitInstruction; - } - } + // Save the description of this reduction variable. + ReductionDescriptor RD(RdxStart, ExitInstruction, Kind, + ReduxDesc.MinMaxKind); + Reductions[Phi] = RD; + // We've ended the cycle. This is a reduction variable if we have an + // outside user and it has a binary op. - return false; + return true; } /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction /// pattern corresponding to a min(X, Y) or max(X, Y). 
LoopVectorizationLegality::ReductionInstDesc -LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, ReductionInstDesc &Prev) { +LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, + ReductionInstDesc &Prev) { - assert((isa<ICmpInst>(I) || isa<SelectInst>(I)) && + assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) && "Expect a select instruction"); - ICmpInst *Cmp = 0; + Instruction *Cmp = 0; SelectInst *Select = 0; // We must handle the select(cmp()) as a single instruction. Advance to the // select. - if ((Cmp = dyn_cast<ICmpInst>(I))) { + if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) { if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->use_begin()))) return ReductionInstDesc(false, I); return ReductionInstDesc(Select, Prev.MinMaxKind); @@ -2990,13 +3349,14 @@ LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, ReductionIns // Only handle single use cases for now. if (!(Select = dyn_cast<SelectInst>(I))) return ReductionInstDesc(false, I); - if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0)))) + if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) && + !(Cmp = dyn_cast<FCmpInst>(I->getOperand(0)))) return ReductionInstDesc(false, I); if (!Cmp->hasOneUse()) return ReductionInstDesc(false, I); - Value *CmpLeft = Cmp->getOperand(0); - Value *CmpRight = Cmp->getOperand(1); + Value *CmpLeft; + Value *CmpRight; // Look for a min/max pattern. if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) @@ -3007,6 +3367,14 @@ LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, ReductionIns return ReductionInstDesc(Select, MRK_SIntMax); else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) return ReductionInstDesc(Select, MRK_SIntMin); + else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_FloatMin); + else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_FloatMax); + else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_FloatMin); + else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_FloatMax); return ReductionInstDesc(false, I); } @@ -3021,7 +3389,8 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I, default: return ReductionInstDesc(false, I); case Instruction::PHI: - if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd)) + if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd && + Kind != RK_FloatMinMax)) return ReductionInstDesc(false, I); return ReductionInstDesc(I, Prev.MinMaxKind); case Instruction::Sub: @@ -3039,9 +3408,11 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I, return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I); case Instruction::FAdd: return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I); + case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: - if (Kind != RK_IntegerMinMax) + if (Kind != RK_IntegerMinMax && + (!HasFunNoNaNAttr || Kind != RK_FloatMinMax)) return ReductionInstDesc(false, I); return isMinMaxSelectCmpPattern(I, Prev); } @@ -3106,8 +3477,12 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) { for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - // We don't predicate loads/stores at the moment. 
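The ordered and unordered matchers used above follow the usual PatternMatch idiom, and both orderings map to the same MinMaxReductionKind because, once NaNs are excluded, ordered and unordered compares agree. A minimal sketch of a caller, assuming the 3.3-era header location of PatternMatch.h:

#include "llvm/Support/PatternMatch.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Recognize select(fcmp olt A, B), A, B, i.e. an fmin-style select;
// A and B bind to the compared operands on a successful match.
static bool isOrderedFloatMin(Value *V) {
  Value *A, *B;
  return m_OrdFMin(m_Value(A), m_Value(B)).match(V);
}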
- if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow()) + // We might be able to hoist the load. + if (it->mayReadFromMemory() && !LoadSpeculation.isHoistableLoad(it)) + return false; + + // We don't predicate stores at the moment. + if (it->mayWriteToMemory() || it->mayThrow()) return false; // The instructions below can trap. diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index cc30cc9..40e0098 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -20,6 +20,7 @@ #include "VecUtils.h" #include "llvm/Transforms/Vectorize.h" +#include "llvm/ADT/MapVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -47,7 +48,7 @@ namespace { /// The SLPVectorizer Pass. struct SLPVectorizer : public FunctionPass { - typedef std::map<Value*, BoUpSLP::StoreList> StoreListMap; + typedef MapVector<Value*, BoUpSLP::StoreList> StoreListMap; /// Pass identification, replacement for typeid static char ID; @@ -77,6 +78,8 @@ struct SLPVectorizer : public FunctionPass { if (!DL) return false; + DEBUG(dbgs()<<"SLP: Analyzing blocks in " << F.getName() << ".\n"); + for (Function::iterator it = F.begin(), e = F.end(); it != e; ++it) { BasicBlock *BB = it; bool BBChanged = false; diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp index 9b94366..21e6cdd 100644 --- a/lib/Transforms/Vectorize/VecUtils.cpp +++ b/lib/Transforms/Vectorize/VecUtils.cpp @@ -46,7 +46,7 @@ namespace llvm { BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl, TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp) : - BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa), L(Lp) { + Builder(S->getContext()), BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa), L(Lp) { numberInstructions(); } @@ -121,6 +121,7 @@ bool BoUpSLP::vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold) { DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); if (Cost < CostThreshold) { DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); + Builder.SetInsertPoint(getInsertionPoint(getLastIndex(Operands,VF))); vectorizeTree(Operands, VF); i += VF - 1; Changed = true; @@ -131,7 +132,7 @@ bool BoUpSLP::vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold) { } bool BoUpSLP::vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold) { - ValueSet Heads, Tails; + SetVector<Value*> Heads, Tails; SmallDenseMap<Value*, Value*> ConsecutiveChain; // We may run into multiple chains that merge into a single chain. We mark the @@ -152,7 +153,8 @@ bool BoUpSLP::vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold) { } // For stores that start but don't end a link in the chain: - for (ValueSet::iterator it = Heads.begin(), e = Heads.end();it != e; ++it) { + for (SetVector<Value*>::iterator it = Heads.begin(), e = Heads.end(); + it != e; ++it) { if (Tails.count(*it)) continue; // We found a store instr that starts a chain. 
Now follow the chain and try @@ -224,9 +226,14 @@ Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) { } void BoUpSLP::vectorizeArith(ArrayRef<Value *> Operands) { + int LastIdx = getLastIndex(Operands, Operands.size()); + Instruction *Loc = getInsertionPoint(LastIdx); + Builder.SetInsertPoint(Loc); + + assert(getFirstUserIndex(Operands, Operands.size()) > LastIdx && + "Vectorizing with in-tree users"); + Value *Vec = vectorizeTree(Operands, Operands.size()); - BasicBlock::iterator Loc = cast<Instruction>(Vec); - IRBuilder<> Builder(++Loc); // After vectorizing the operands we need to generate extractelement // instructions and replace all of the uses of the scalar values with // the values that we extracted from the vectorized tree. @@ -243,6 +250,16 @@ int BoUpSLP::getTreeCost(ArrayRef<Value *> VL) { LaneMap.clear(); MultiUserVals.clear(); MustScalarize.clear(); + MustExtract.clear(); + + // Find the location of the last root. + int LastRootIndex = getLastIndex(VL, VL.size()); + int FirstUserIndex = getFirstUserIndex(VL, VL.size()); + + // Don't vectorize if there are users of the tree roots inside the tree + // itself. + if (LastRootIndex > FirstUserIndex) + return max_cost; // Scan the tree and find which value is used by which lane, and which values // must be scalarized. @@ -250,7 +267,7 @@ int BoUpSLP::getTreeCost(ArrayRef<Value *> VL) { // Check that instructions with multiple users can be vectorized. Mark unsafe // instructions. - for (ValueSet::iterator it = MultiUserVals.begin(), + for (SetVector<Value*>::iterator it = MultiUserVals.begin(), e = MultiUserVals.end(); it != e; ++it) { // Check that all of the users of this instr are within the tree // and that they are all from the same lane. @@ -258,15 +275,35 @@ int BoUpSLP::getTreeCost(ArrayRef<Value *> VL) { for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end(); I != E; ++I) { if (LaneMap.find(*I) == LaneMap.end()) { - MustScalarize.insert(*it); - DEBUG(dbgs()<<"SLP: Adding " << **it << - " to MustScalarize because of an out of tree usage.\n"); - break; + DEBUG(dbgs()<<"SLP: Instr " << **it << " has multiple users.\n"); + + // We don't have an ordering problem if the user is not in this basic + // block. + Instruction *Inst = cast<Instruction>(*I); + if (Inst->getParent() != BB) { + MustExtract.insert(*it); + continue; + } + + // We don't have an ordering problem if the user is after the last root. + int Idx = InstrIdx[Inst]; + if (Idx < LastRootIndex) { + MustScalarize.insert(*it); + DEBUG(dbgs()<<"SLP: Adding to MustScalarize " + "because of an unsafe out of tree usage.\n"); + break; + } + + + DEBUG(dbgs()<<"SLP: Adding to MustExtract " + "because of a safe out of tree usage.\n"); + MustExtract.insert(*it); + continue; } if (Lane == -1) Lane = LaneMap[*I]; if (Lane != LaneMap[*I]) { MustScalarize.insert(*it); - DEBUG(dbgs()<<"Adding " << **it << + DEBUG(dbgs()<<"SLP: Adding " << **it << " to MustScalarize because multiple lane use it: " << Lane << " and " << LaneMap[*I] << ".\n"); break; @@ -311,14 +348,6 @@ void BoUpSLP::getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth) { if (!I || Opcode != I->getOpcode()) return; } - // Mark instructions with multiple users. - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - Instruction *I = dyn_cast<Instruction>(VL[i]); - // Remember to check if all of the users of this instr are vectorized - // within our tree. 
- if (I && I->getNumUses() > 1) MultiUserVals.insert(I); - } - for (int i = 0, e = VL.size(); i < e; ++i) { // Check that the instruction is only used within // one lane. @@ -327,6 +356,19 @@ void BoUpSLP::getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth) { LaneMap[VL[i]] = i; } + // Mark instructions with multiple users. + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + Instruction *I = dyn_cast<Instruction>(VL[i]); + // Remember to check if all of the users of this instr are vectorized + // within our tree. At depth zero we have no local users, only external + // users that we don't care about. + if (Depth && I && I->getNumUses() > 1) { + DEBUG(dbgs()<<"SLP: Adding to MultiUserVals " + "because it has multiple users:" << *I << " \n"); + MultiUserVals.insert(I); + } + } + switch (Opcode) { case Instruction::ZExt: case Instruction::SExt: @@ -440,11 +482,9 @@ int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) { // Check if it is safe to sink the loads or the stores. if (Opcode == Instruction::Load || Opcode == Instruction::Store) { - int MaxIdx = InstrIdx[VL0]; - for (unsigned i = 1, e = VL.size(); i < e; ++i ) - MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]); - + int MaxIdx = getLastIndex(VL, VL.size()); Instruction *Last = InstrVec[MaxIdx]; + for (unsigned i = 0, e = VL.size(); i < e; ++i ) { if (VL[i] == Last) continue; Value *Barrier = isUnsafeToSink(cast<Instruction>(VL[i]), Last); @@ -456,6 +496,13 @@ int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) { } } + // Calculate the extract cost. + unsigned ExternalUserExtractCost = 0; + for (unsigned i = 0, e = VL.size(); i < e; ++i) + if (MustExtract.count(VL[i])) + ExternalUserExtractCost += + TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); + switch (Opcode) { case Instruction::ZExt: case Instruction::SExt: @@ -469,7 +516,7 @@ int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) { case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - int Cost = 0; + int Cost = ExternalUserExtractCost; ValueList Operands; Type *SrcTy = VL0->getOperand(0)->getType(); // Prepare the operand vector. @@ -510,7 +557,7 @@ int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) { case Instruction::And: case Instruction::Or: case Instruction::Xor: { - int Cost = 0; + int Cost = ExternalUserExtractCost; // Calculate the cost of all of the operands. for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; @@ -540,7 +587,7 @@ int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) { int ScalarLdCost = VecTy->getNumElements() * TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); - return VecLdCost - ScalarLdCost; + return VecLdCost - ScalarLdCost + ExternalUserExtractCost; } case Instruction::Store: { // We know that we can merge the stores. Calculate the cost. @@ -556,7 +603,7 @@ int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) { } int TotalCost = StoreCost + getTreeCost_rec(Operands, Depth + 1); - return TotalCost; + return TotalCost + ExternalUserExtractCost; } default: // Unable to vectorize unknown instructions. 
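The new ExternalUserExtractCost term prices in the extractelement instructions that out-of-tree users will need, instead of giving up whenever a tree value escapes. The MustScalarize/MustExtract split in source terms (hypothetical shape):

// t0 and t1 form a 2-wide SLP tree rooted at the stores. The extra use
// of t0 comes after the last root, so it can be served by an
// extractelement (MustExtract) whose TTI cost is charged to the tree;
// a use between the roots would still force MustScalarize.
void treeWithEscape(int *a, int *b, int *c, int &other) {
  int t0 = a[0] + b[0];
  int t1 = a[1] + b[1];
  c[0] = t0;
  c[1] = t1;
  other = t0 * 3; // out-of-tree user after the last root
}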
@@ -564,15 +611,40 @@ int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) { } } -Instruction *BoUpSLP::GetLastInstr(ArrayRef<Value *> VL, unsigned VF) { +int BoUpSLP::getLastIndex(ArrayRef<Value *> VL, unsigned VF) { int MaxIdx = InstrIdx[BB->getFirstNonPHI()]; for (unsigned i = 0; i < VF; ++i ) MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]); - return InstrVec[MaxIdx + 1]; + return MaxIdx; +} + +int BoUpSLP::getFirstUserIndex(ArrayRef<Value *> VL, unsigned VF) { + // Find the first user of the values. + int FirstUser = InstrVec.size(); + for (unsigned i = 0; i < VF; ++i) { + for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end(); + U != UE; ++U) { + Instruction *Instr = dyn_cast<Instruction>(*U); + if (!Instr || Instr->getParent() != BB) + continue; + + FirstUser = std::min(FirstUser, InstrIdx[Instr]); + } + } + return FirstUser; +} + +int BoUpSLP::getLastIndex(Instruction *I, Instruction *J) { + assert(I->getParent() == BB && "Invalid parent for instruction I"); + assert(J->getParent() == BB && "Invalid parent for instruction J"); + return std::max(InstrIdx[I], InstrIdx[J]); +} + +Instruction *BoUpSLP::getInsertionPoint(unsigned Index) { + return InstrVec[Index + 1]; } Value *BoUpSLP::Scalarize(ArrayRef<Value *> VL, VectorType *Ty) { - IRBuilder<> Builder(GetLastInstr(VL, Ty->getNumElements())); Value *Vec = UndefValue::get(Ty); for (unsigned i=0; i < Ty->getNumElements(); ++i) { // Generate the 'InsertElement' instruction. @@ -583,15 +655,51 @@ Value *BoUpSLP::Scalarize(ArrayRef<Value *> VL, VectorType *Ty) { GatherInstructions.push_back(Vec); } + for (unsigned i = 0; i < Ty->getNumElements(); ++i) + VectorizedValues[VL[i]] = Vec; + return Vec; } Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int VF) { Value *V = vectorizeTree_rec(VL, VF); + + int LastInstrIdx = getLastIndex(VL, VL.size()); + for (SetVector<Value*>::iterator it = MustExtract.begin(), + e = MustExtract.end(); it != e; ++it) { + Instruction *I = cast<Instruction>(*it); + + // This is a scalarized value, so we can use the original value. + // No need to extract from the vector. + if (!LaneMap.count(I)) + continue; + + Value *Vec = VectorizedValues[I]; + // We decided not to vectorize I because one of its users was not + // vectorized. This is okay. + if (!Vec) + continue; + + Value *Idx = Builder.getInt32(LaneMap[I]); + Value *Extract = Builder.CreateExtractElement(Vec, Idx); + bool Replaced = false; + for (Value::use_iterator U = I->use_begin(), UE = I->use_end(); U != UE; + ++U) { + Instruction *UI = cast<Instruction>(*U); + if (UI->getParent() != I->getParent() || InstrIdx[UI] > LastInstrIdx) { + UI->replaceUsesOfWith(I, Extract); + Replaced = true; + } + } + assert(Replaced && "Must replace at least one outside user"); + (void)Replaced; + } + // We moved some instructions around. We have to number them again // before we can do any analysis. numberInstructions(); MustScalarize.clear(); + MustExtract.clear(); + VectorizedValues.clear(); return V; } @@ -614,19 +722,27 @@ Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) { } // Check that this is a simple vector constant. - if (AllConst || AllSameScalar) return Scalarize(VL, VecTy); + if (AllConst || AllSameScalar) + return Scalarize(VL, VecTy); // Scalarize unknown structures.
Instruction *VL0 = dyn_cast<Instruction>(VL[0]); - if (!VL0) return Scalarize(VL, VecTy); + if (!VL0) + return Scalarize(VL, VecTy); - if (VectorizedValues.count(VL0)) return VectorizedValues[VL0]; + if (VectorizedValues.count(VL0)) { + Value * Vec = VectorizedValues[VL0]; + for (int i = 0; i < VF; ++i) + VectorizedValues[VL[i]] = Vec; + return Vec; + } unsigned Opcode = VL0->getOpcode(); for (unsigned i = 0, e = VF; i < e; ++i) { Instruction *I = dyn_cast<Instruction>(VL[i]); // If not all of the instructions are identical then we have to scalarize. - if (!I || Opcode != I->getOpcode()) return Scalarize(VL, VecTy); + if (!I || Opcode != I->getOpcode()) + return Scalarize(VL, VecTy); } switch (Opcode) { @@ -646,10 +762,12 @@ Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) { for (int i = 0; i < VF; ++i) INVL.push_back(cast<Instruction>(VL[i])->getOperand(0)); Value *InVec = vectorizeTree_rec(INVL, VF); - IRBuilder<> Builder(GetLastInstr(VL, VF)); CastInst *CI = dyn_cast<CastInst>(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); - VectorizedValues[VL0] = V; + + for (int i = 0; i < VF; ++i) + VectorizedValues[VL[i]] = V; + return V; } case Instruction::Add: @@ -672,16 +790,18 @@ Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) { case Instruction::Xor: { ValueList LHSVL, RHSVL; for (int i = 0; i < VF; ++i) { - RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0)); - LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1)); + LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0)); + RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1)); } - Value *RHS = vectorizeTree_rec(RHSVL, VF); Value *LHS = vectorizeTree_rec(LHSVL, VF); - IRBuilder<> Builder(GetLastInstr(VL, VF)); + Value *RHS = vectorizeTree_rec(RHSVL, VF); BinaryOperator *BinOp = cast<BinaryOperator>(VL0); - Value *V = Builder.CreateBinOp(BinOp->getOpcode(), RHS,LHS); - VectorizedValues[VL0] = V; + Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS,RHS); + + for (int i = 0; i < VF; ++i) + VectorizedValues[VL[i]] = V; + return V; } case Instruction::Load: { @@ -693,12 +813,18 @@ Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) { if (!isConsecutiveAccess(VL[i-1], VL[i])) return Scalarize(VL, VecTy); - IRBuilder<> Builder(GetLastInstr(VL, VF)); - Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), - VecTy->getPointerTo()); - LI = Builder.CreateLoad(VecPtr); + // Loads are inserted at the head of the tree because we don't want to sink + // them all the way down past store instructions. 
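The comment above is the key scheduling constraint for vectorized loads; a sketch of why sinking them is unsafe (hypothetical shape, assuming the pointers may alias):

// If the <2 x i32> load of a[0..1] were emitted at the bottom of the
// tree, it would be sunk past stores that may alias it and could read
// the wrong values.
void loadBeforeStores(int *a, int *b) {
  int t0 = a[0];
  int t1 = a[1];
  b[0] = 0; // may alias a[0..1]
  b[1] = 1;
  a[2] = t0 + t1;
}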
+ Instruction *Loc = getInsertionPoint(getLastIndex(VL, VL.size())); + IRBuilder<> LoadBuilder(Loc); + Value *VecPtr = LoadBuilder.CreateBitCast(LI->getPointerOperand(), + VecTy->getPointerTo()); + LI = LoadBuilder.CreateLoad(VecPtr); LI->setAlignment(Alignment); - VectorizedValues[VL0] = LI; + + for (int i = 0; i < VF; ++i) + VectorizedValues[VL[i]] = LI; + return LI; } case Instruction::Store: { ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand()); Value *VecValue = vectorizeTree_rec(ValueOp, VF); - - IRBuilder<> Builder(GetLastInstr(VL, VF)); Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo()); Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment); @@ -721,9 +845,7 @@ Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) { return 0; } default: - Value *S = Scalarize(VL, VecTy); - VectorizedValues[VL0] = S; - return S; + return Scalarize(VL, VecTy); } } diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h index 5456c6c..d41d2ed 100644 --- a/lib/Transforms/Vectorize/VecUtils.h +++ b/lib/Transforms/Vectorize/VecUtils.h @@ -16,9 +16,11 @@ #define LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/IR/IRBuilder.h" #include <vector> namespace llvm { @@ -107,9 +109,19 @@ private: /// \returns the pointer to the barrier instruction if we can't sink. Value *isUnsafeToSink(Instruction *Src, Instruction *Dst); - /// \returns the instruction that appears last in the BB from \p VL. + /// \returns the index of the last instruction in the BB from \p VL. /// Only consider the first \p VF elements. - Instruction *GetLastInstr(ArrayRef<Value *> VL, unsigned VF); + int getLastIndex(ArrayRef<Value *> VL, unsigned VF); + + /// \returns the index of the first user of \p VL. + /// Only consider the first \p VF elements. + int getFirstUserIndex(ArrayRef<Value *> VL, unsigned VF); + + /// \returns the index of the instruction \p I or \p J that appears last in the BB. + int getLastIndex(Instruction *I, Instruction *J); + + /// \returns the insertion point for \p Index. + Instruction *getInsertionPoint(unsigned Index); /// \returns a vector from a collection of scalars in \p VL. Value *Scalarize(ArrayRef<Value *> VL, VectorType *Ty); @@ -127,15 +139,20 @@ private: /// NOTICE: The vectorization methods also use this set. ValueSet MustScalarize; + /// Contains values that have users outside of the vectorized graph. + /// We need to generate extract instructions for these values. + /// NOTICE: The vectorization methods also use this set. + SetVector<Value*> MustExtract; + /// Contains a list of values that are used outside the current tree. This /// set must be reset between runs. - ValueSet MultiUserVals; + SetVector<Value*> MultiUserVals; /// Maps values in the tree to the vector lanes that uses them. This map must /// be reset between runs of getCost. std::map<Value*, int> LaneMap; /// A list of instructions to ignore while sinking /// memory instructions. This map must be reset between runs of getCost. - SmallPtrSet<Value *, 8> MemBarrierIgnoreList; + ValueSet MemBarrierIgnoreList; // -- Containers that are used during vectorizeTree -- // @@ -150,6 +167,9 @@ private: /// Iterating over this list is faster than calling LICM.
ValueList GatherInstructions; + /// Instruction builder to construct the vectorized tree. + IRBuilder<> Builder; + // Analysis and block reference. BasicBlock *BB; ScalarEvolution *SE;
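getFirstUserIndex, declared above, implements the getTreeCost guard seen earlier: if some user of a tree root sits before the last root, fusing the roots into a single vector instruction at the insertion point would place that definition after its use. In source terms (hypothetical shape):

// The use of t0 sits between the two roots, so FirstUserIndex is less
// than LastRootIndex and getTreeCost bails out with max_cost.
void inTreeUser(int *a, int *b, int *c) {
  int t0 = a[0] + b[0];
  c[2] = t0 * 2; // user of t0 before the second root
  int t1 = a[1] + b[1];
  c[0] = t0;
  c[1] = t1;
}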
