Diffstat (limited to 'lib/CodeGen/CodeGenPrepare.cpp')
-rw-r--r--  lib/CodeGen/CodeGenPrepare.cpp | 1139
1 file changed, 992 insertions(+), 147 deletions(-)
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 8d20848..c0d7dca 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" @@ -30,20 +31,22 @@ #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Statepoint.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/ValueMap.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/BypassSlowDivision.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/SimplifyLibCalls.h" using namespace llvm; using namespace llvm::PatternMatch; @@ -70,6 +73,10 @@ static cl::opt<bool> DisableBranchOpts( "disable-cgp-branch-opts", cl::Hidden, cl::init(false), cl::desc("Disable branch optimizations in CodeGenPrepare")); +static cl::opt<bool> + DisableGCOpts("disable-cgp-gc-opts", cl::Hidden, cl::init(false), + cl::desc("Disable GC optimizations in CodeGenPrepare")); + static cl::opt<bool> DisableSelectToBranch( "disable-cgp-select2branch", cl::Hidden, cl::init(false), cl::desc("Disable select to branch conversion.")); @@ -90,6 +97,16 @@ static cl::opt<bool> StressStoreExtract( "stress-cgp-store-extract", cl::Hidden, cl::init(false), cl::desc("Stress test store(extract) optimizations in CodeGenPrepare")); +static cl::opt<bool> DisableExtLdPromotion( + "disable-cgp-ext-ld-promotion", cl::Hidden, cl::init(false), + cl::desc("Disable ext(promotable(ld)) -> promoted(ext(ld)) optimization in " + "CodeGenPrepare")); + +static cl::opt<bool> StressExtLdPromotion( + "stress-cgp-ext-ld-promotion", cl::Hidden, cl::init(false), + cl::desc("Stress test ext(promotable(ld)) -> promoted(ext(ld)) " + "optimization in CodeGenPrepare")); + namespace { typedef SmallPtrSet<Instruction *, 16> SetOfInstrs; struct TypeIsSExt { @@ -98,6 +115,7 @@ struct TypeIsSExt { TypeIsSExt(Type *Ty, bool IsSExt) : Ty(Ty), IsSExt(IsSExt) {} }; typedef DenseMap<Instruction *, TypeIsSExt> InstrToOrigTy; +class TypePromotionTransaction; class CodeGenPrepare : public FunctionPass { /// TLI - Keep a pointer of a TargetLowering to consult for determining @@ -143,8 +161,8 @@ typedef DenseMap<Instruction *, TypeIsSExt> InstrToOrigTy; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<TargetLibraryInfo>(); - AU.addRequired<TargetTransformInfo>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); } private: @@ -152,12 +170,12 @@ typedef DenseMap<Instruction *, TypeIsSExt> InstrToOrigTy; bool EliminateMostlyEmptyBlocks(Function &F); bool CanMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const; void EliminateMostlyEmptyBlock(BasicBlock *BB); - bool OptimizeBlock(BasicBlock &BB); - bool OptimizeInst(Instruction *I); + bool OptimizeBlock(BasicBlock &BB, bool& 
ModifiedDT); + bool OptimizeInst(Instruction *I, bool& ModifiedDT); bool OptimizeMemoryInst(Instruction *I, Value *Addr, Type *AccessTy); bool OptimizeInlineAsmInst(CallInst *CS); - bool OptimizeCallInst(CallInst *CI); - bool MoveExtToFormExtLoad(Instruction *I); + bool OptimizeCallInst(CallInst *CI, bool& ModifiedDT); + bool MoveExtToFormExtLoad(Instruction *&I); bool OptimizeExtUses(Instruction *I); bool OptimizeSelectInst(SelectInst *SI); bool OptimizeShuffleVectorInst(ShuffleVectorInst *SI); @@ -165,6 +183,12 @@ typedef DenseMap<Instruction *, TypeIsSExt> InstrToOrigTy; bool DupRetToEnableTailCallOpts(BasicBlock *BB); bool PlaceDbgValues(Function &F); bool sinkAndCmp(Function &F); + bool ExtLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI, + Instruction *&Inst, + const SmallVectorImpl<Instruction *> &Exts, + unsigned CreatedInst); + bool splitBranchCondition(Function &F); + bool simplifyOffsetableRelocate(Instruction &I); }; } @@ -187,14 +211,13 @@ bool CodeGenPrepare::runOnFunction(Function &F) { ModifiedDT = false; if (TM) - TLI = TM->getSubtargetImpl()->getTargetLowering(); - TLInfo = &getAnalysis<TargetLibraryInfo>(); - TTI = &getAnalysis<TargetTransformInfo>(); + TLI = TM->getSubtargetImpl(F)->getTargetLowering(); + TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); DominatorTreeWrapperPass *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); DT = DTWP ? &DTWP->getDomTree() : nullptr; - OptSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::OptimizeForSize); + OptSize = F.hasFnAttribute(Attribute::OptimizeForSize); /// This optimization identifies DIV instructions that can be /// profitably bypassed and carried out with a shorter, faster divide. @@ -218,15 +241,23 @@ bool CodeGenPrepare::runOnFunction(Function &F) { // into a single target instruction, push the mask and compare into branch // users. Do this before OptimizeBlock -> OptimizeInst -> // OptimizeCmpExpression, which perturbs the pattern being searched for. 
- if (!DisableBranchOpts) + if (!DisableBranchOpts) { EverMadeChange |= sinkAndCmp(F); + EverMadeChange |= splitBranchCondition(F); + } bool MadeChange = true; while (MadeChange) { MadeChange = false; for (Function::iterator I = F.begin(); I != F.end(); ) { BasicBlock *BB = I++; - MadeChange |= OptimizeBlock(*BB); + bool ModifiedDTOnIteration = false; + MadeChange |= OptimizeBlock(*BB, ModifiedDTOnIteration); + + // Restart BB iteration if the dominator tree of the Function was changed + ModifiedDT |= ModifiedDTOnIteration; + if (ModifiedDTOnIteration) + break; } EverMadeChange |= MadeChange; } @@ -236,9 +267,9 @@ bool CodeGenPrepare::runOnFunction(Function &F) { if (!DisableBranchOpts) { MadeChange = false; SmallPtrSet<BasicBlock*, 8> WorkList; - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB)); - MadeChange |= ConstantFoldTerminator(BB, true); + for (BasicBlock &BB : F) { + SmallVector<BasicBlock *, 2> Successors(succ_begin(&BB), succ_end(&BB)); + MadeChange |= ConstantFoldTerminator(&BB, true); if (!MadeChange) continue; for (SmallVectorImpl<BasicBlock*>::iterator @@ -272,6 +303,16 @@ bool CodeGenPrepare::runOnFunction(Function &F) { EverMadeChange |= MadeChange; } + if (!DisableGCOpts) { + SmallVector<Instruction *, 2> Statepoints; + for (BasicBlock &BB : F) + for (Instruction &I : BB) + if (isStatepoint(I)) + Statepoints.push_back(&I); + for (auto &I : Statepoints) + EverMadeChange |= simplifyOffsetableRelocate(*I); + } + if (ModifiedDT && DT) DT->recalculate(F); @@ -300,7 +341,7 @@ bool CodeGenPrepare::EliminateFallThrough(Function &F) { // Remember if SinglePred was the entry block of the function. // If so, we will need to move BB back to the entry position. bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock(); - MergeBasicBlockIntoOnlyPred(BB, this); + MergeBasicBlockIntoOnlyPred(BB, DT); if (isEntry && BB != &BB->getParent()->getEntryBlock()) BB->moveBefore(&BB->getParent()->getEntryBlock()); @@ -440,7 +481,7 @@ void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) { // Remember if SinglePred was the entry block of the function. If so, we // will need to move BB back to the entry position. 
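Note on the reworked driver loop in runOnFunction above: OptimizeBlock now reports, through ModifiedDTOnIteration, whether it invalidated the dominator tree, and the walk over the function's blocks restarts whenever it did. A minimal standalone sketch of that control shape, where Block and optimizeBlock are hypothetical stand-ins for the pass's real types, not its API:

#include <cstdio>
#include <vector>

struct Block { int Id; };

// Hypothetical stand-in for CodeGenPrepare::OptimizeBlock: pretends that
// visiting block 2 splits it once, which invalidates the dominator tree.
static bool optimizeBlock(Block &B, bool &ModifiedDT) {
  static bool SplitDone = false;
  if (B.Id == 2 && !SplitDone) {
    SplitDone = true;
    ModifiedDT = true;
    return true;
  }
  return false;
}

int main() {
  std::vector<Block> F = {{0}, {1}, {2}, {3}};
  bool EverMadeChange = false;
  bool MadeChange = true;
  while (MadeChange) {
    MadeChange = false;
    for (Block &B : F) {
      bool ModifiedDTOnIteration = false;
      MadeChange |= optimizeBlock(B, ModifiedDTOnIteration);
      // Restart the function walk: the block iterator and any cached
      // dominator tree may be stale after a CFG rewrite.
      if (ModifiedDTOnIteration)
        break;
    }
    EverMadeChange |= MadeChange;
  }
  std::printf("ever made change: %d\n", (int)EverMadeChange);
}

Restarting the outer while loop is the conservative choice: the loop only terminates once a full pass over the function makes no change, so no block is skipped by the early break.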
bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock(); - MergeBasicBlockIntoOnlyPred(DestBB, this); + MergeBasicBlockIntoOnlyPred(DestBB, DT); if (isEntry && BB != &BB->getParent()->getEntryBlock()) BB->moveBefore(&BB->getParent()->getEntryBlock()); @@ -495,6 +536,144 @@ void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) { DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n"); } +// Computes a map of base pointer relocation instructions to corresponding +// derived pointer relocation instructions given a vector of all relocate calls +static void computeBaseDerivedRelocateMap( + const SmallVectorImpl<User *> &AllRelocateCalls, + DenseMap<IntrinsicInst *, SmallVector<IntrinsicInst *, 2>> & + RelocateInstMap) { + // Collect information in two maps: one primarily for locating the base object + // while filling the second map; the second map is the final structure holding + // a mapping between Base and corresponding Derived relocate calls + DenseMap<std::pair<unsigned, unsigned>, IntrinsicInst *> RelocateIdxMap; + for (auto &U : AllRelocateCalls) { + GCRelocateOperands ThisRelocate(U); + IntrinsicInst *I = cast<IntrinsicInst>(U); + auto K = std::make_pair(ThisRelocate.basePtrIndex(), + ThisRelocate.derivedPtrIndex()); + RelocateIdxMap.insert(std::make_pair(K, I)); + } + for (auto &Item : RelocateIdxMap) { + std::pair<unsigned, unsigned> Key = Item.first; + if (Key.first == Key.second) + // Base relocation: nothing to insert + continue; + + IntrinsicInst *I = Item.second; + auto BaseKey = std::make_pair(Key.first, Key.first); + IntrinsicInst *Base = RelocateIdxMap[BaseKey]; + if (!Base) + // TODO: We might want to insert a new base object relocate and gep off + // that, if there are enough derived object relocates. + continue; + RelocateInstMap[Base].push_back(I); + } +} + +// Accepts a GEP and extracts the operands into a vector provided they're all +// small integer constants +static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP, + SmallVectorImpl<Value *> &OffsetV) { + for (unsigned i = 1; i < GEP->getNumOperands(); i++) { + // Only accept small constant integer operands + auto Op = dyn_cast<ConstantInt>(GEP->getOperand(i)); + if (!Op || Op->getZExtValue() > 20) + return false; + } + + for (unsigned i = 1; i < GEP->getNumOperands(); i++) + OffsetV.push_back(GEP->getOperand(i)); + return true; +} + +// Takes a RelocatedBase (base pointer relocation instruction) and Targets to +// replace, computes a replacement, and affects it. +static bool +simplifyRelocatesOffABase(IntrinsicInst *RelocatedBase, + const SmallVectorImpl<IntrinsicInst *> &Targets) { + bool MadeChange = false; + for (auto &ToReplace : Targets) { + GCRelocateOperands MasterRelocate(RelocatedBase); + GCRelocateOperands ThisRelocate(ToReplace); + + assert(ThisRelocate.basePtrIndex() == MasterRelocate.basePtrIndex() && + "Not relocating a derived object of the original base object"); + if (ThisRelocate.basePtrIndex() == ThisRelocate.derivedPtrIndex()) { + // A duplicate relocate call. TODO: coalesce duplicates. 
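computeBaseDerivedRelocateMap above keys every gc.relocate by its (base index, derived index) operand pair: an entry whose two indices are equal is a base relocate, and every other entry sharing its first index is a derived relocate that can be rewritten off that base. A standalone model of that grouping, where Reloc and the sample names are illustrative stand-ins (the (4,4)/(4,5) index pairs mirror the doc comment further down):

#include <cstdio>
#include <map>
#include <utility>
#include <vector>

int main() {
  // Each relocate is identified here by a name; the indices mimic the
  // statepoint operand positions from the example IR below.
  struct Reloc { const char *Name; unsigned BaseIdx, DerivedIdx; };
  std::vector<Reloc> AllRelocates = {
      {"%base'", 4, 4}, // base relocate: BaseIdx == DerivedIdx
      {"%ptr'", 4, 5},  // derived relocate off operand 4
      {"%ptr2'", 4, 6}, // another derived relocate off operand 4
  };

  std::map<std::pair<unsigned, unsigned>, const Reloc *> RelocateIdxMap;
  for (const Reloc &R : AllRelocates)
    RelocateIdxMap[{R.BaseIdx, R.DerivedIdx}] = &R;

  // Group each derived relocate under its base relocate, if one exists.
  std::map<const Reloc *, std::vector<const Reloc *>> RelocateInstMap;
  for (const auto &Item : RelocateIdxMap) {
    auto Key = Item.first;
    if (Key.first == Key.second)
      continue; // base relocate: nothing to insert
    auto BaseIt = RelocateIdxMap.find({Key.first, Key.first});
    if (BaseIt == RelocateIdxMap.end())
      continue; // no base relocate to offset against
    RelocateInstMap[BaseIt->second].push_back(Item.second);
  }

  for (const auto &Item : RelocateInstMap)
    for (const Reloc *Derived : Item.second)
      std::printf("%s can be rewritten as a gep off %s\n", Derived->Name,
                  Item.first->Name);
}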
+ continue; + } + + Value *Base = ThisRelocate.basePtr(); + auto Derived = dyn_cast<GetElementPtrInst>(ThisRelocate.derivedPtr()); + if (!Derived || Derived->getPointerOperand() != Base) + continue; + + SmallVector<Value *, 2> OffsetV; + if (!getGEPSmallConstantIntOffsetV(Derived, OffsetV)) + continue; + + // Create a Builder and replace the target callsite with a gep + IRBuilder<> Builder(ToReplace); + Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc()); + Value *Replacement = + Builder.CreateGEP(RelocatedBase, makeArrayRef(OffsetV)); + Instruction *ReplacementInst = cast<Instruction>(Replacement); + ReplacementInst->removeFromParent(); + ReplacementInst->insertAfter(RelocatedBase); + Replacement->takeName(ToReplace); + ToReplace->replaceAllUsesWith(Replacement); + ToReplace->eraseFromParent(); + + MadeChange = true; + } + return MadeChange; +} + +// Turns this: +// +// %base = ... +// %ptr = gep %base + 15 +// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr) +// %base' = relocate(%tok, i32 4, i32 4) +// %ptr' = relocate(%tok, i32 4, i32 5) +// %val = load %ptr' +// +// into this: +// +// %base = ... +// %ptr = gep %base + 15 +// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr) +// %base' = gc.relocate(%tok, i32 4, i32 4) +// %ptr' = gep %base' + 15 +// %val = load %ptr' +bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) { + bool MadeChange = false; + SmallVector<User *, 2> AllRelocateCalls; + + for (auto *U : I.users()) + if (isGCRelocate(dyn_cast<Instruction>(U))) + // Collect all the relocate calls associated with a statepoint + AllRelocateCalls.push_back(U); + + // We need atleast one base pointer relocation + one derived pointer + // relocation to mangle + if (AllRelocateCalls.size() < 2) + return false; + + // RelocateInstMap is a mapping from the base relocate instruction to the + // corresponding derived relocate instructions + DenseMap<IntrinsicInst *, SmallVector<IntrinsicInst *, 2>> RelocateInstMap; + computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap); + if (RelocateInstMap.empty()) + return false; + + for (auto &Item : RelocateInstMap) + // Item.first is the RelocatedBase to offset against + // Item.second is the vector of Targets to replace + MadeChange = simplifyRelocatesOffABase(Item.first, Item.second); + return MadeChange; +} + /// SinkCast - Sink the specified cast instruction into its user blocks static bool SinkCast(CastInst *CI) { BasicBlock *DefBB = CI->getParent(); @@ -822,23 +1001,211 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, return MadeChange; } -namespace { -class CodeGenPrepareFortifiedLibCalls : public SimplifyFortifiedLibCalls { -protected: - void replaceCall(Value *With) override { - CI->replaceAllUsesWith(With); - CI->eraseFromParent(); +// ScalarizeMaskedLoad() translates masked load intrinsic, like +// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align, +// <16 x i1> %mask, <16 x i32> %passthru) +// to a chain of basic blocks, whith loading element one-by-one if +// the appropriate mask bit is set +// +// %1 = bitcast i8* %addr to i32* +// %2 = extractelement <16 x i1> %mask, i32 0 +// %3 = icmp eq i1 %2, true +// br i1 %3, label %cond.load, label %else +// +//cond.load: ; preds = %0 +// %4 = getelementptr i32* %1, i32 0 +// %5 = load i32* %4 +// %6 = insertelement <16 x i32> undef, i32 %5, i32 0 +// br label %else +// +//else: ; preds = %0, %cond.load +// %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ] +// %7 = extractelement <16 x i1> %mask, 
i32 1 +// %8 = icmp eq i1 %7, true +// br i1 %8, label %cond.load1, label %else2 +// +//cond.load1: ; preds = %else +// %9 = getelementptr i32* %1, i32 1 +// %10 = load i32* %9 +// %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1 +// br label %else2 +// +//else2: ; preds = %else, %cond.load1 +// %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ] +// %12 = extractelement <16 x i1> %mask, i32 2 +// %13 = icmp eq i1 %12, true +// br i1 %13, label %cond.load4, label %else5 +// +static void ScalarizeMaskedLoad(CallInst *CI) { + Value *Ptr = CI->getArgOperand(0); + Value *Src0 = CI->getArgOperand(3); + Value *Mask = CI->getArgOperand(2); + VectorType *VecType = dyn_cast<VectorType>(CI->getType()); + Type *EltTy = VecType->getElementType(); + + assert(VecType && "Unexpected return type of masked load intrinsic"); + + IRBuilder<> Builder(CI->getContext()); + Instruction *InsertPt = CI; + BasicBlock *IfBlock = CI->getParent(); + BasicBlock *CondBlock = nullptr; + BasicBlock *PrevIfBlock = CI->getParent(); + Builder.SetInsertPoint(InsertPt); + + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + // Bitcast %addr fron i8* to EltTy* + Type *NewPtrType = + EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace()); + Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType); + Value *UndefVal = UndefValue::get(VecType); + + // The result vector + Value *VResult = UndefVal; + + PHINode *Phi = nullptr; + Value *PrevPhi = UndefVal; + + unsigned VectorWidth = VecType->getNumElements(); + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + + // Fill the "else" block, created in the previous iteration + // + // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ] + // %mask_1 = extractelement <16 x i1> %mask, i32 Idx + // %to_load = icmp eq i1 %mask_1, true + // br i1 %to_load, label %cond.load, label %else + // + if (Idx > 0) { + Phi = Builder.CreatePHI(VecType, 2, "res.phi.else"); + Phi->addIncoming(VResult, CondBlock); + Phi->addIncoming(PrevPhi, PrevIfBlock); + PrevPhi = Phi; + VResult = Phi; + } + + Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx)); + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, + ConstantInt::get(Predicate->getType(), 1)); + + // Create "cond" block + // + // %EltAddr = getelementptr i32* %1, i32 0 + // %Elt = load i32* %EltAddr + // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx + // + CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load"); + Builder.SetInsertPoint(InsertPt); + + Value* Gep = Builder.CreateInBoundsGEP(FirstEltPtr, Builder.getInt32(Idx)); + LoadInst* Load = Builder.CreateLoad(Gep, false); + VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx)); + + // Create "else" block, fill it in the next iteration + BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); + Builder.SetInsertPoint(InsertPt); + Instruction *OldBr = IfBlock->getTerminator(); + BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); + OldBr->eraseFromParent(); + PrevIfBlock = IfBlock; + IfBlock = NewIfBlock; } - bool isFoldable(unsigned SizeCIOp, unsigned, bool) const override { - if (ConstantInt *SizeCI = - dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp))) - return SizeCI->isAllOnesValue(); - return false; + + Phi = Builder.CreatePHI(VecType, 2, "res.phi.select"); + Phi->addIncoming(VResult, CondBlock); + Phi->addIncoming(PrevPhi, PrevIfBlock); + Value *NewI = Builder.CreateSelect(Mask, Phi, Src0); + 
CI->replaceAllUsesWith(NewI); + CI->eraseFromParent(); +} + +// ScalarizeMaskedStore() translates masked store intrinsic, like +// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align, +// <16 x i1> %mask) +// to a chain of basic blocks, that stores element one-by-one if +// the appropriate mask bit is set +// +// %1 = bitcast i8* %addr to i32* +// %2 = extractelement <16 x i1> %mask, i32 0 +// %3 = icmp eq i1 %2, true +// br i1 %3, label %cond.store, label %else +// +// cond.store: ; preds = %0 +// %4 = extractelement <16 x i32> %val, i32 0 +// %5 = getelementptr i32* %1, i32 0 +// store i32 %4, i32* %5 +// br label %else +// +// else: ; preds = %0, %cond.store +// %6 = extractelement <16 x i1> %mask, i32 1 +// %7 = icmp eq i1 %6, true +// br i1 %7, label %cond.store1, label %else2 +// +// cond.store1: ; preds = %else +// %8 = extractelement <16 x i32> %val, i32 1 +// %9 = getelementptr i32* %1, i32 1 +// store i32 %8, i32* %9 +// br label %else2 +// . . . +static void ScalarizeMaskedStore(CallInst *CI) { + Value *Ptr = CI->getArgOperand(1); + Value *Src = CI->getArgOperand(0); + Value *Mask = CI->getArgOperand(3); + + VectorType *VecType = dyn_cast<VectorType>(Src->getType()); + Type *EltTy = VecType->getElementType(); + + assert(VecType && "Unexpected data type in masked store intrinsic"); + + IRBuilder<> Builder(CI->getContext()); + Instruction *InsertPt = CI; + BasicBlock *IfBlock = CI->getParent(); + Builder.SetInsertPoint(InsertPt); + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + // Bitcast %addr fron i8* to EltTy* + Type *NewPtrType = + EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace()); + Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType); + + unsigned VectorWidth = VecType->getNumElements(); + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + + // Fill the "else" block, created in the previous iteration + // + // %mask_1 = extractelement <16 x i1> %mask, i32 Idx + // %to_store = icmp eq i1 %mask_1, true + // br i1 %to_load, label %cond.store, label %else + // + Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx)); + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, + ConstantInt::get(Predicate->getType(), 1)); + + // Create "cond" block + // + // %OneElt = extractelement <16 x i32> %Src, i32 Idx + // %EltAddr = getelementptr i32* %1, i32 0 + // %store i32 %OneElt, i32* %EltAddr + // + BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); + Builder.SetInsertPoint(InsertPt); + + Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx)); + Value* Gep = Builder.CreateInBoundsGEP(FirstEltPtr, Builder.getInt32(Idx)); + Builder.CreateStore(OneElt, Gep); + + // Create "else" block, fill it in the next iteration + BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); + Builder.SetInsertPoint(InsertPt); + Instruction *OldBr = IfBlock->getTerminator(); + BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); + OldBr->eraseFromParent(); + IfBlock = NewIfBlock; } -}; -} // end anonymous namespace + CI->eraseFromParent(); +} -bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { +bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) { BasicBlock *BB = CI->getParent(); // Lower inline assembly if we can. 
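ScalarizeMaskedLoad and ScalarizeMaskedStore above unroll a masked vector access into a chain of conditional blocks, one per lane, ending (for the load) in select(Mask, Phi, Src0). What the emitted CFG computes is easy to state in scalar form; a sketch under that reading, with maskedLoadModel as a hypothetical stand-in for the generated block chain:

#include <array>
#include <cstdio>

// For each lane: load only if the mask bit is set (the per-lane "cond.load"
// block); otherwise keep the pass-through value, matching the final
// CreateSelect(Mask, Phi, Src0) in the hunk above.
template <typename T, size_t N>
std::array<T, N> maskedLoadModel(const T *Addr, const std::array<bool, N> &Mask,
                                 const std::array<T, N> &PassThru) {
  std::array<T, N> Result = PassThru;
  for (size_t Idx = 0; Idx < N; ++Idx)
    if (Mask[Idx])
      Result[Idx] = Addr[Idx]; // gep + load of element Idx
  return Result;
}

int main() {
  int Mem[4] = {10, 20, 30, 40};
  std::array<bool, 4> Mask = {true, false, true, false};
  std::array<int, 4> PassThru = {-1, -2, -3, -4};
  auto R = maskedLoadModel<int, 4>(Mem, Mask, PassThru);
  for (int V : R)
    std::printf("%d ", V); // prints: 10 -2 30 -4
  std::printf("\n");
}

The store case is the same loop with the branch bodies storing extracted elements instead of loading them, and no pass-through value.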
@@ -858,38 +1225,60 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { return true; } - // Lower all uses of llvm.objectsize.* IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI); - if (II && II->getIntrinsicID() == Intrinsic::objectsize) { - bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1); - Type *ReturnTy = CI->getType(); - Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL); - - // Substituting this can cause recursive simplifications, which can - // invalidate our iterator. Use a WeakVH to hold onto it in case this - // happens. - WeakVH IterHandle(CurInstIterator); + if (II) { + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::objectsize: { + // Lower all uses of llvm.objectsize.* + bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1); + Type *ReturnTy = CI->getType(); + Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL); + + // Substituting this can cause recursive simplifications, which can + // invalidate our iterator. Use a WeakVH to hold onto it in case this + // happens. + WeakVH IterHandle(CurInstIterator); + + replaceAndRecursivelySimplify(CI, RetVal, + TLI ? TLI->getDataLayout() : nullptr, + TLInfo, ModifiedDT ? nullptr : DT); - replaceAndRecursivelySimplify(CI, RetVal, - TLI ? TLI->getDataLayout() : nullptr, - TLInfo, ModifiedDT ? nullptr : DT); - - // If the iterator instruction was recursively deleted, start over at the - // start of the block. - if (IterHandle != CurInstIterator) { - CurInstIterator = BB->begin(); - SunkAddrs.clear(); + // If the iterator instruction was recursively deleted, start over at the + // start of the block. + if (IterHandle != CurInstIterator) { + CurInstIterator = BB->begin(); + SunkAddrs.clear(); + } + return true; + } + case Intrinsic::masked_load: { + // Scalarize unsupported vector masked load + if (!TTI->isLegalMaskedLoad(CI->getType(), 1)) { + ScalarizeMaskedLoad(CI); + ModifiedDT = true; + return true; + } + return false; + } + case Intrinsic::masked_store: { + if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType(), 1)) { + ScalarizeMaskedStore(CI); + ModifiedDT = true; + return true; + } + return false; + } } - return true; - } - if (II && TLI) { - SmallVector<Value*, 2> PtrOps; - Type *AccessTy; - if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy)) - while (!PtrOps.empty()) - if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy)) - return true; + if (TLI) { + SmallVector<Value*, 2> PtrOps; + Type *AccessTy; + if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy)) + while (!PtrOps.empty()) + if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy)) + return true; + } } // From here on out we're working with named functions. @@ -901,10 +1290,15 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { // Lower all default uses of _chk calls. This is very similar // to what InstCombineCalls does, but here we are only lowering calls - // that have the default "don't know" as the objectsize. Anything else - // should be left alone. - CodeGenPrepareFortifiedLibCalls Simplifier; - return Simplifier.fold(CI, TD, TLInfo); + // to fortified library functions (e.g. __memcpy_chk) that have the default + // "don't know" as the objectsize. Anything else should be left alone. 
+ FortifiedLibCallSimplifier Simplifier(TD, TLInfo, true); + if (Value *V = Simplifier.optimizeCall(CI)) { + CI->replaceAllUsesWith(V); + CI->eraseFromParent(); + return true; + } + return false; } /// DupRetToEnableTailCallOpts - Look for opportunities to duplicate return @@ -1561,6 +1955,7 @@ void TypePromotionTransaction::rollback( /// This encapsulates the logic for matching the target-legal addressing modes. class AddressingModeMatcher { SmallVectorImpl<Instruction*> &AddrModeInsts; + const TargetMachine &TM; const TargetLowering &TLI; /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and @@ -1584,13 +1979,15 @@ class AddressingModeMatcher { /// always returns true. bool IgnoreProfitability; - AddressingModeMatcher(SmallVectorImpl<Instruction*> &AMI, - const TargetLowering &T, Type *AT, - Instruction *MI, ExtAddrMode &AM, - const SetOfInstrs &InsertedTruncs, + AddressingModeMatcher(SmallVectorImpl<Instruction *> &AMI, + const TargetMachine &TM, Type *AT, Instruction *MI, + ExtAddrMode &AM, const SetOfInstrs &InsertedTruncs, InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT) - : AddrModeInsts(AMI), TLI(T), AccessTy(AT), MemoryInst(MI), AddrMode(AM), + : AddrModeInsts(AMI), TM(TM), + TLI(*TM.getSubtargetImpl(*MI->getParent()->getParent()) + ->getTargetLowering()), + AccessTy(AT), MemoryInst(MI), AddrMode(AM), InsertedTruncs(InsertedTruncs), PromotedInsts(PromotedInsts), TPT(TPT) { IgnoreProfitability = false; } @@ -1607,13 +2004,13 @@ public: static ExtAddrMode Match(Value *V, Type *AccessTy, Instruction *MemoryInst, SmallVectorImpl<Instruction*> &AddrModeInsts, - const TargetLowering &TLI, + const TargetMachine &TM, const SetOfInstrs &InsertedTruncs, InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT) { ExtAddrMode Result; - bool Success = AddressingModeMatcher(AddrModeInsts, TLI, AccessTy, + bool Success = AddressingModeMatcher(AddrModeInsts, TM, AccessTy, MemoryInst, Result, InsertedTruncs, PromotedInsts, TPT).MatchAddr(V, 0); (void)Success; assert(Success && "Couldn't select *anything*?"); @@ -1718,6 +2115,23 @@ static bool MightBeFoldableInst(Instruction *I) { } } +/// \brief Check whether or not \p Val is a legal instruction for \p TLI. +/// \note \p Val is assumed to be the product of some type promotion. +/// Therefore if \p Val has an undefined state in \p TLI, this is assumed +/// to be legal, as the non-promoted value would have had the same state. +static bool isPromotedInstructionLegal(const TargetLowering &TLI, Value *Val) { + Instruction *PromotedInst = dyn_cast<Instruction>(Val); + if (!PromotedInst) + return false; + int ISDOpcode = TLI.InstructionOpcodeToISD(PromotedInst->getOpcode()); + // If the ISDOpcode is undefined, it was undefined before the promotion. + if (!ISDOpcode) + return true; + // Otherwise, check if the promoted instruction is legal or not. + return TLI.isOperationLegalOrCustom( + ISDOpcode, TLI.getValueType(PromotedInst->getType())); +} + /// \brief Hepler class to perform type promotion. class TypePromotionHelper { /// \brief Utility function to check whether or not a sign or zero extension @@ -1747,46 +2161,59 @@ class TypePromotionHelper { /// \p PromotedInsts maps the instructions to their type before promotion. /// \p CreatedInsts[out] contains how many non-free instructions have been /// created to promote the operand of Ext. + /// Newly added extensions are inserted in \p Exts. + /// Newly added truncates are inserted in \p Truncs. /// Should never be called directly. 
/// \return The promoted value which is used instead of Ext. - static Value *promoteOperandForTruncAndAnyExt(Instruction *Ext, - TypePromotionTransaction &TPT, - InstrToOrigTy &PromotedInsts, - unsigned &CreatedInsts); + static Value *promoteOperandForTruncAndAnyExt( + Instruction *Ext, TypePromotionTransaction &TPT, + InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts, + SmallVectorImpl<Instruction *> *Exts, + SmallVectorImpl<Instruction *> *Truncs); /// \brief Utility function to promote the operand of \p Ext when this /// operand is promotable and is not a supported trunc or sext. /// \p PromotedInsts maps the instructions to their type before promotion. /// \p CreatedInsts[out] contains how many non-free instructions have been /// created to promote the operand of Ext. + /// Newly added extensions are inserted in \p Exts. + /// Newly added truncates are inserted in \p Truncs. /// Should never be called directly. /// \return The promoted value which is used instead of Ext. - static Value *promoteOperandForOther(Instruction *Ext, - TypePromotionTransaction &TPT, - InstrToOrigTy &PromotedInsts, - unsigned &CreatedInsts, bool IsSExt); + static Value * + promoteOperandForOther(Instruction *Ext, TypePromotionTransaction &TPT, + InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts, + SmallVectorImpl<Instruction *> *Exts, + SmallVectorImpl<Instruction *> *Truncs, bool IsSExt); /// \see promoteOperandForOther. - static Value *signExtendOperandForOther(Instruction *Ext, - TypePromotionTransaction &TPT, - InstrToOrigTy &PromotedInsts, - unsigned &CreatedInsts) { - return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInsts, true); + static Value * + signExtendOperandForOther(Instruction *Ext, TypePromotionTransaction &TPT, + InstrToOrigTy &PromotedInsts, + unsigned &CreatedInsts, + SmallVectorImpl<Instruction *> *Exts, + SmallVectorImpl<Instruction *> *Truncs) { + return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInsts, Exts, + Truncs, true); } /// \see promoteOperandForOther. - static Value *zeroExtendOperandForOther(Instruction *Ext, - TypePromotionTransaction &TPT, - InstrToOrigTy &PromotedInsts, - unsigned &CreatedInsts) { - return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInsts, false); + static Value * + zeroExtendOperandForOther(Instruction *Ext, TypePromotionTransaction &TPT, + InstrToOrigTy &PromotedInsts, + unsigned &CreatedInsts, + SmallVectorImpl<Instruction *> *Exts, + SmallVectorImpl<Instruction *> *Truncs) { + return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInsts, Exts, + Truncs, false); } public: /// Type for the utility function that promotes the operand of Ext. typedef Value *(*Action)(Instruction *Ext, TypePromotionTransaction &TPT, - InstrToOrigTy &PromotedInsts, - unsigned &CreatedInsts); + InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts, + SmallVectorImpl<Instruction *> *Exts, + SmallVectorImpl<Instruction *> *Truncs); /// \brief Given a sign/zero extend instruction \p Ext, return the approriate /// action to promote the operand of \p Ext instead of using Ext. /// \return NULL if no promotable action is possible with the current @@ -1805,6 +2232,12 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst, Type *ConsideredExtType, const InstrToOrigTy &PromotedInsts, bool IsSExt) { + // The promotion helper does not know how to deal with vector types yet. + // To be able to fix that, we would need to fix the places where we + // statically extend, e.g., constants and such. 
+ if (Inst->getType()->isVectorTy()) + return false; + // We can always get through zext. if (isa<ZExtInst>(Inst)) return true; @@ -1830,8 +2263,9 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst, // Check if we can use this operand in the extension. // If the type is larger than the result type of the extension, // we cannot. - if (OpndVal->getType()->getIntegerBitWidth() > - ConsideredExtType->getIntegerBitWidth()) + if (!OpndVal->getType()->isIntegerTy() || + OpndVal->getType()->getIntegerBitWidth() > + ConsideredExtType->getIntegerBitWidth()) return false; // If the operand of the truncate is not an instruction, we will not have @@ -1896,7 +2330,9 @@ TypePromotionHelper::Action TypePromotionHelper::getAction( Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt( llvm::Instruction *SExt, TypePromotionTransaction &TPT, - InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts) { + InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts, + SmallVectorImpl<Instruction *> *Exts, + SmallVectorImpl<Instruction *> *Truncs) { // By construction, the operand of SExt is an instruction. Otherwise we cannot // get through it and this method should not be called. Instruction *SExtOpnd = cast<Instruction>(SExt->getOperand(0)); @@ -1922,8 +2358,11 @@ Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt( // Check if the extension is still needed. Instruction *ExtInst = dyn_cast<Instruction>(ExtVal); - if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(0)->getType()) + if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(0)->getType()) { + if (ExtInst && Exts) + Exts->push_back(ExtInst); return ExtVal; + } // At this point we have: ext ty opnd to ty. // Reassign the uses of ExtInst to the opnd and remove ExtInst. @@ -1934,7 +2373,9 @@ Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt( Value *TypePromotionHelper::promoteOperandForOther( Instruction *Ext, TypePromotionTransaction &TPT, - InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts, bool IsSExt) { + InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts, + SmallVectorImpl<Instruction *> *Exts, + SmallVectorImpl<Instruction *> *Truncs, bool IsSExt) { // By construction, the operand of Ext is an instruction. Otherwise we cannot // get through it and this method should not be called. Instruction *ExtOpnd = cast<Instruction>(Ext->getOperand(0)); @@ -1949,6 +2390,8 @@ Value *TypePromotionHelper::promoteOperandForOther( ITrunc->removeFromParent(); // Insert it just after the definition. ITrunc->insertAfter(ExtOpnd); + if (Truncs) + Truncs->push_back(ITrunc); } TPT.replaceAllUsesWith(ExtOpnd, Trunc); @@ -2004,12 +2447,17 @@ Value *TypePromotionHelper::promoteOperandForOther( if (!ExtForOpnd) { // If yes, create a new one. DEBUG(dbgs() << "More operands to ext\n"); - ExtForOpnd = - cast<Instruction>(IsSExt ? TPT.createSExt(Ext, Opnd, Ext->getType()) - : TPT.createZExt(Ext, Opnd, Ext->getType())); + Value *ValForExtOpnd = IsSExt ? TPT.createSExt(Ext, Opnd, Ext->getType()) + : TPT.createZExt(Ext, Opnd, Ext->getType()); + if (!isa<Instruction>(ValForExtOpnd)) { + TPT.setOperand(ExtOpnd, OpIdx, ValForExtOpnd); + continue; + } + ExtForOpnd = cast<Instruction>(ValForExtOpnd); ++CreatedInsts; } - + if (Exts) + Exts->push_back(ExtForOpnd); TPT.setOperand(ExtForOpnd, 0, Opnd); // Move the sign extension before the insertion point. @@ -2047,16 +2495,7 @@ AddressingModeMatcher::IsPromotionProfitable(unsigned MatchedSize, // The promotion is neutral but it may help folding the sign extension in // loads for instance. 
// Check that we did not create an illegal instruction. - Instruction *PromotedInst = dyn_cast<Instruction>(PromotedOperand); - if (!PromotedInst) - return false; - int ISDOpcode = TLI.InstructionOpcodeToISD(PromotedInst->getOpcode()); - // If the ISDOpcode is undefined, it was undefined before the promotion. - if (!ISDOpcode) - return true; - // Otherwise, check if the promoted instruction is legal or not. - return TLI.isOperationLegalOrCustom( - ISDOpcode, TLI.getValueType(PromotedInst->getType())); + return isPromotedInstructionLegal(TLI, PromotedOperand); } /// MatchOperationAddr - Given an instruction or constant expr, see if we can @@ -2250,7 +2689,8 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); unsigned CreatedInsts = 0; - Value *PromotedOperand = TPH(Ext, TPT, PromotedInsts, CreatedInsts); + Value *PromotedOperand = + TPH(Ext, TPT, PromotedInsts, CreatedInsts, nullptr, nullptr); // SExt has been moved away. // Thus either it will be rematched later in the recursive calls or it is // gone. Anyway, we must not fold it into the addressing mode at this point. @@ -2374,13 +2814,17 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { /// inline asm call are due to memory operands. If so, return true, otherwise /// return false. static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, - const TargetLowering &TLI) { - TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(ImmutableCallSite(CI)); + const TargetMachine &TM) { + const Function *F = CI->getParent()->getParent(); + const TargetLowering *TLI = TM.getSubtargetImpl(*F)->getTargetLowering(); + const TargetRegisterInfo *TRI = TM.getSubtargetImpl(*F)->getRegisterInfo(); + TargetLowering::AsmOperandInfoVector TargetConstraints = + TLI->ParseConstraints(TRI, ImmutableCallSite(CI)); for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; // Compute the constraint code and ConstraintType to use. - TLI.ComputeConstraintToUse(OpInfo, SDValue()); + TLI->ComputeConstraintToUse(OpInfo, SDValue()); // If this asm operand is our Value*, and if it isn't an indirect memory // operand, we can't fold it! @@ -2396,10 +2840,10 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, /// FindAllMemoryUses - Recursively walk all the uses of I until we find a /// memory use. If we find an obviously non-foldable instruction, return true. /// Add the ultimately found memory instructions to MemoryUses. -static bool FindAllMemoryUses(Instruction *I, - SmallVectorImpl<std::pair<Instruction*,unsigned> > &MemoryUses, - SmallPtrSetImpl<Instruction*> &ConsideredInsts, - const TargetLowering &TLI) { +static bool FindAllMemoryUses( + Instruction *I, + SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses, + SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetMachine &TM) { // If we already considered this instruction, we're done. if (!ConsideredInsts.insert(I).second) return false; @@ -2429,12 +2873,12 @@ static bool FindAllMemoryUses(Instruction *I, if (!IA) return true; // If this is a memory operand, we're cool, otherwise bail out. 
- if (!IsOperandAMemoryOperand(CI, IA, I, TLI)) + if (!IsOperandAMemoryOperand(CI, IA, I, TM)) return true; continue; } - if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI)) + if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TM)) return true; } @@ -2522,7 +2966,7 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, // uses. SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses; SmallPtrSet<Instruction*, 16> ConsideredInsts; - if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI)) + if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TM)) return false; // Has a non-memory, non-foldable use! // Now that we know that all uses of this instruction are part of a chain of @@ -2547,7 +2991,7 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, ExtAddrMode Result; TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); - AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, AddressAccessTy, + AddressingModeMatcher Matcher(MatchedAddrModeInsts, TM, AddressAccessTy, MemoryInst, Result, InsertedTruncs, PromotedInsts, TPT); Matcher.IgnoreProfitability = true; @@ -2630,7 +3074,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // For non-PHIs, determine the addressing mode being computed. SmallVector<Instruction*, 16> NewAddrModeInsts; ExtAddrMode NewAddrMode = AddressingModeMatcher::Match( - V, AccessTy, MemoryInst, NewAddrModeInsts, *TLI, InsertedTruncsSet, + V, AccessTy, MemoryInst, NewAddrModeInsts, *TM, InsertedTruncsSet, PromotedInsts, TPT); // This check is broken into two cases with very similar code to avoid using @@ -2705,8 +3149,10 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, << *MemoryInst << "\n"); if (SunkAddr->getType() != Addr->getType()) SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType()); - } else if (AddrSinkUsingGEPs || (!AddrSinkUsingGEPs.getNumOccurrences() && - TM && TM->getSubtarget<TargetSubtargetInfo>().useAA())) { + } else if (AddrSinkUsingGEPs || + (!AddrSinkUsingGEPs.getNumOccurrences() && TM && + TM->getSubtargetImpl(*MemoryInst->getParent()->getParent()) + ->useAA())) { // By default, we use the GEP-based method when AA is used later. This // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities. DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " @@ -2929,8 +3375,10 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) { bool MadeChange = false; + const TargetRegisterInfo *TRI = + TM->getSubtargetImpl(*CS->getParent()->getParent())->getRegisterInfo(); TargetLowering::AsmOperandInfoVector - TargetConstraints = TLI->ParseConstraints(CS); + TargetConstraints = TLI->ParseConstraints(TRI, CS); unsigned ArgNo = 0; for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; @@ -2949,26 +3397,186 @@ bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) { return MadeChange; } +/// \brief Check if all the uses of \p Inst are equivalent (or free) zero or +/// sign extensions. 
+static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) { + assert(!Inst->use_empty() && "Input must have at least one use"); + const Instruction *FirstUser = cast<Instruction>(*Inst->user_begin()); + bool IsSExt = isa<SExtInst>(FirstUser); + Type *ExtTy = FirstUser->getType(); + for (const User *U : Inst->users()) { + const Instruction *UI = cast<Instruction>(U); + if ((IsSExt && !isa<SExtInst>(UI)) || (!IsSExt && !isa<ZExtInst>(UI))) + return false; + Type *CurTy = UI->getType(); + // Same input and output types: Same instruction after CSE. + if (CurTy == ExtTy) + continue; + + // If IsSExt is true, we are in this situation: + // a = Inst + // b = sext ty1 a to ty2 + // c = sext ty1 a to ty3 + // Assuming ty2 is shorter than ty3, this could be turned into: + // a = Inst + // b = sext ty1 a to ty2 + // c = sext ty2 b to ty3 + // However, the last sext is not free. + if (IsSExt) + return false; + + // This is a ZExt, maybe this is free to extend from one type to another. + // In that case, we would not account for a different use. + Type *NarrowTy; + Type *LargeTy; + if (ExtTy->getScalarType()->getIntegerBitWidth() > + CurTy->getScalarType()->getIntegerBitWidth()) { + NarrowTy = CurTy; + LargeTy = ExtTy; + } else { + NarrowTy = ExtTy; + LargeTy = CurTy; + } + + if (!TLI.isZExtFree(NarrowTy, LargeTy)) + return false; + } + // All uses are the same or can be derived from one another for free. + return true; +} + +/// \brief Try to form ExtLd by promoting \p Exts until they reach a +/// load instruction. +/// If an ext(load) can be formed, it is returned via \p LI for the load +/// and \p Inst for the extension. +/// Otherwise LI == nullptr and Inst == nullptr. +/// When some promotion happened, \p TPT contains the proper state to +/// revert them. +/// +/// \return true when promoting was necessary to expose the ext(load) +/// opportunity, false otherwise. +/// +/// Example: +/// \code +/// %ld = load i32* %addr +/// %add = add nuw i32 %ld, 4 +/// %zext = zext i32 %add to i64 +/// \endcode +/// => +/// \code +/// %ld = load i32* %addr +/// %zext = zext i32 %ld to i64 +/// %add = add nuw i64 %zext, 4 +/// \encode +/// Thanks to the promotion, we can match zext(load i32*) to i64. +bool CodeGenPrepare::ExtLdPromotion(TypePromotionTransaction &TPT, + LoadInst *&LI, Instruction *&Inst, + const SmallVectorImpl<Instruction *> &Exts, + unsigned CreatedInsts = 0) { + // Iterate over all the extensions to see if one form an ext(load). + for (auto I : Exts) { + // Check if we directly have ext(load). + if ((LI = dyn_cast<LoadInst>(I->getOperand(0)))) { + Inst = I; + // No promotion happened here. + return false; + } + // Check whether or not we want to do any promotion. + if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion) + continue; + // Get the action to perform the promotion. + TypePromotionHelper::Action TPH = TypePromotionHelper::getAction( + I, InsertedTruncsSet, *TLI, PromotedInsts); + // Check if we can promote. + if (!TPH) + continue; + // Save the current state. + TypePromotionTransaction::ConstRestorationPt LastKnownGood = + TPT.getRestorationPoint(); + SmallVector<Instruction *, 4> NewExts; + unsigned NewCreatedInsts = 0; + // Promote. + Value *PromotedVal = + TPH(I, TPT, PromotedInsts, NewCreatedInsts, &NewExts, nullptr); + assert(PromotedVal && + "TypePromotionHelper should have filtered out those cases"); + + // We would be able to merge only one extension in a load. 
+ // Therefore, if we have more than 1 new extension we heuristically + // cut this search path, because it means we degrade the code quality. + // With exactly 2, the transformation is neutral, because we will merge + // one extension but leave one. However, we optimistically keep going, + // because the new extension may be removed too. + unsigned TotalCreatedInsts = CreatedInsts + NewCreatedInsts; + if (!StressExtLdPromotion && + (TotalCreatedInsts > 1 || + !isPromotedInstructionLegal(*TLI, PromotedVal))) { + // The promotion is not profitable, rollback to the previous state. + TPT.rollback(LastKnownGood); + continue; + } + // The promotion is profitable. + // Check if it exposes an ext(load). + (void)ExtLdPromotion(TPT, LI, Inst, NewExts, TotalCreatedInsts); + if (LI && (StressExtLdPromotion || NewCreatedInsts == 0 || + // If we have created a new extension, i.e., now we have two + // extensions. We must make sure one of them is merged with + // the load, otherwise we may degrade the code quality. + (LI->hasOneUse() || hasSameExtUse(LI, *TLI)))) + // Promotion happened. + return true; + // If this does not help to expose an ext(load) then, rollback. + TPT.rollback(LastKnownGood); + } + // None of the extension can form an ext(load). + LI = nullptr; + Inst = nullptr; + return false; +} + /// MoveExtToFormExtLoad - Move a zext or sext fed by a load into the same /// basic block as the load, unless conditions are unfavorable. This allows /// SelectionDAG to fold the extend into the load. +/// \p I[in/out] the extension may be modified during the process if some +/// promotions apply. /// -bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *I) { +bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *&I) { + // Try to promote a chain of computation if it allows to form + // an extended load. + TypePromotionTransaction TPT; + TypePromotionTransaction::ConstRestorationPt LastKnownGood = + TPT.getRestorationPoint(); + SmallVector<Instruction *, 1> Exts; + Exts.push_back(I); // Look for a load being extended. - LoadInst *LI = dyn_cast<LoadInst>(I->getOperand(0)); - if (!LI) return false; + LoadInst *LI = nullptr; + Instruction *OldExt = I; + bool HasPromoted = ExtLdPromotion(TPT, LI, I, Exts); + if (!LI || !I) { + assert(!HasPromoted && !LI && "If we did not match any load instruction " + "the code must remain the same"); + I = OldExt; + return false; + } // If they're already in the same block, there's nothing to do. - if (LI->getParent() == I->getParent()) + // Make the cheap checks first if we did not promote. + // If we promoted, we need to check if it is indeed profitable. + if (!HasPromoted && LI->getParent() == I->getParent()) return false; + EVT VT = TLI->getValueType(I->getType()); + EVT LoadVT = TLI->getValueType(LI->getType()); + // If the load has other users and the truncate is not free, this probably // isn't worthwhile. - if (!LI->hasOneUse() && - TLI && (TLI->isTypeLegal(TLI->getValueType(LI->getType())) || - !TLI->isTypeLegal(TLI->getValueType(I->getType()))) && - !TLI->isTruncateFree(I->getType(), LI->getType())) + if (!LI->hasOneUse() && TLI && + (TLI->isTypeLegal(LoadVT) || !TLI->isTypeLegal(VT)) && + !TLI->isTruncateFree(I->getType(), LI->getType())) { + I = OldExt; + TPT.rollback(LastKnownGood); return false; + } // Check whether the target supports casts folded into loads. 
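The promotion ExtLdPromotion performs is only sound because of the wrap flags: for the doc comment's example, zext(add nuw i32 %ld, 4) equals add i64 (zext %ld), 4 exactly when the narrow add cannot wrap, which is what nuw asserts. A small numeric check of both directions (the concrete values are arbitrary, not from the patch):

#include <cstdint>
#include <cstdio>

int main() {
  // With 'nuw' the narrow add never wraps, so both evaluation orders agree:
  uint32_t Ld = 0x7FFFFFF0u;                        // %ld = load i32* %addr
  uint64_t Promoted = (uint64_t)Ld + 4;             // add i64 (zext %ld), 4
  uint64_t Original = (uint64_t)(uint32_t)(Ld + 4); // zext (add nuw i32 %ld, 4)
  std::printf("%llu == %llu\n", (unsigned long long)Promoted,
              (unsigned long long)Original);

  // Without 'nuw' the rewrite would be wrong: a wrapping narrow add differs.
  uint32_t Wrap = 0xFFFFFFFEu;
  std::printf("%llu != %llu\n", (unsigned long long)((uint64_t)Wrap + 4),
              (unsigned long long)(uint32_t)(Wrap + 4));
}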
unsigned LType; @@ -2978,11 +3586,15 @@ bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *I) { assert(isa<SExtInst>(I) && "Unexpected ext type!"); LType = ISD::SEXTLOAD; } - if (TLI && !TLI->isLoadExtLegal(LType, TLI->getValueType(LI->getType()))) + if (TLI && !TLI->isLoadExtLegal(LType, VT, LoadVT)) { + I = OldExt; + TPT.rollback(LastKnownGood); return false; + } // Move the extend into the same block as the load, so that SelectionDAG // can fold it. + TPT.commit(); I->removeFromParent(); I->insertAfter(LI); ++NumExtsMoved; @@ -3512,7 +4124,8 @@ void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) { isa<UndefValue>(Val) || canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo())); } else - assert(0 && "Did you modified shouldPromote and forgot to update this?"); + llvm_unreachable("Did you modified shouldPromote and forgot to update " + "this?"); ToBePromoted->setOperand(U.getOperandNo(), NewVal); } Transition->removeFromParent(); @@ -3575,7 +4188,7 @@ bool CodeGenPrepare::OptimizeExtractElementInst(Instruction *Inst) { return false; } -bool CodeGenPrepare::OptimizeInst(Instruction *I) { +bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) { if (PHINode *P = dyn_cast<PHINode>(I)) { // It is possible for very late stage optimizations (such as SimplifyCFG) // to introduce PHI nodes too late to be cleaned up. If we detect such a @@ -3654,14 +4267,14 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) { GEPI->replaceAllUsesWith(NC); GEPI->eraseFromParent(); ++NumGEPsElim; - OptimizeInst(NC); + OptimizeInst(NC, ModifiedDT); return true; } return false; } if (CallInst *CI = dyn_cast<CallInst>(I)) - return OptimizeCallInst(CI); + return OptimizeCallInst(CI, ModifiedDT); if (SelectInst *SI = dyn_cast<SelectInst>(I)) return OptimizeSelectInst(SI); @@ -3678,14 +4291,16 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) { // In this pass we look for GEP and cast instructions that are used // across basic blocks and rewrite them to improve basic-block-at-a-time // selection. -bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) { +bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB, bool& ModifiedDT) { SunkAddrs.clear(); bool MadeChange = false; CurInstIterator = BB.begin(); - while (CurInstIterator != BB.end()) - MadeChange |= OptimizeInst(CurInstIterator++); - + while (CurInstIterator != BB.end()) { + MadeChange |= OptimizeInst(CurInstIterator++, ModifiedDT); + if (ModifiedDT) + return true; + } MadeChange |= DupRetToEnableTailCallOpts(&BB); return MadeChange; @@ -3696,10 +4311,10 @@ bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) { // find a node corresponding to the value. bool CodeGenPrepare::PlaceDbgValues(Function &F) { bool MadeChange = false; - for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { + for (BasicBlock &BB : F) { Instruction *PrevNonDbgInst = nullptr; - for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) { - Instruction *Insn = BI; ++BI; + for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) { + Instruction *Insn = BI++; DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn); // Leave dbg.values that refer to an alloca alone. These // instrinsics describe the address of a variable (= the alloca) @@ -3793,3 +4408,233 @@ bool CodeGenPrepare::sinkAndCmp(Function &F) { } return MadeChange; } + +/// \brief Retrieve the probabilities of a conditional branch. Returns true on +/// success, or returns false if no or invalid metadata was found. 
+static bool extractBranchMetadata(BranchInst *BI, + uint64_t &ProbTrue, uint64_t &ProbFalse) { + assert(BI->isConditional() && + "Looking for probabilities on unconditional branch?"); + auto *ProfileData = BI->getMetadata(LLVMContext::MD_prof); + if (!ProfileData || ProfileData->getNumOperands() != 3) + return false; + + const auto *CITrue = + mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(1)); + const auto *CIFalse = + mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(2)); + if (!CITrue || !CIFalse) + return false; + + ProbTrue = CITrue->getValue().getZExtValue(); + ProbFalse = CIFalse->getValue().getZExtValue(); + + return true; +} + +/// \brief Scale down both weights to fit into uint32_t. +static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) { + uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse; + uint32_t Scale = (NewMax / UINT32_MAX) + 1; + NewTrue = NewTrue / Scale; + NewFalse = NewFalse / Scale; +} + +/// \brief Some targets prefer to split a conditional branch like: +/// \code +/// %0 = icmp ne i32 %a, 0 +/// %1 = icmp ne i32 %b, 0 +/// %or.cond = or i1 %0, %1 +/// br i1 %or.cond, label %TrueBB, label %FalseBB +/// \endcode +/// into multiple branch instructions like: +/// \code +/// bb1: +/// %0 = icmp ne i32 %a, 0 +/// br i1 %0, label %TrueBB, label %bb2 +/// bb2: +/// %1 = icmp ne i32 %b, 0 +/// br i1 %1, label %TrueBB, label %FalseBB +/// \endcode +/// This usually allows instruction selection to do even further optimizations +/// and combine the compare with the branch instruction. Currently this is +/// applied for targets which have "cheap" jump instructions. +/// +/// FIXME: Remove the (equivalent?) implementation in SelectionDAG. +/// +bool CodeGenPrepare::splitBranchCondition(Function &F) { + if (!TM || TM->Options.EnableFastISel != true || + !TLI || TLI->isJumpExpensive()) + return false; + + bool MadeChange = false; + for (auto &BB : F) { + // Does this BB end with the following? + // %cond1 = icmp|fcmp|binary instruction ... + // %cond2 = icmp|fcmp|binary instruction ... + // %cond.or = or|and i1 %cond1, cond2 + // br i1 %cond.or label %dest1, label %dest2" + BinaryOperator *LogicOp; + BasicBlock *TBB, *FBB; + if (!match(BB.getTerminator(), m_Br(m_OneUse(m_BinOp(LogicOp)), TBB, FBB))) + continue; + + unsigned Opc; + Value *Cond1, *Cond2; + if (match(LogicOp, m_And(m_OneUse(m_Value(Cond1)), + m_OneUse(m_Value(Cond2))))) + Opc = Instruction::And; + else if (match(LogicOp, m_Or(m_OneUse(m_Value(Cond1)), + m_OneUse(m_Value(Cond2))))) + Opc = Instruction::Or; + else + continue; + + if (!match(Cond1, m_CombineOr(m_Cmp(), m_BinOp())) || + !match(Cond2, m_CombineOr(m_Cmp(), m_BinOp())) ) + continue; + + DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump()); + + // Create a new BB. + auto *InsertBefore = std::next(Function::iterator(BB)) + .getNodePtrUnchecked(); + auto TmpBB = BasicBlock::Create(BB.getContext(), + BB.getName() + ".cond.split", + BB.getParent(), InsertBefore); + + // Update original basic block by using the first condition directly by the + // branch instruction and removing the no longer needed and/or instruction. + auto *Br1 = cast<BranchInst>(BB.getTerminator()); + Br1->setCondition(Cond1); + LogicOp->eraseFromParent(); + + // Depending on the conditon we have to either replace the true or the false + // successor of the original branch instruction. + if (Opc == Instruction::And) + Br1->setSuccessor(0, TmpBB); + else + Br1->setSuccessor(1, TmpBB); + + // Fill in the new basic block. 
+ auto *Br2 = IRBuilder<>(TmpBB).CreateCondBr(Cond2, TBB, FBB); + if (auto *I = dyn_cast<Instruction>(Cond2)) { + I->removeFromParent(); + I->insertBefore(Br2); + } + + // Update PHI nodes in both successors. The original BB needs to be + // replaced in one succesor's PHI nodes, because the branch comes now from + // the newly generated BB (NewBB). In the other successor we need to add one + // incoming edge to the PHI nodes, because both branch instructions target + // now the same successor. Depending on the original branch condition + // (and/or) we have to swap the successors (TrueDest, FalseDest), so that + // we perfrom the correct update for the PHI nodes. + // This doesn't change the successor order of the just created branch + // instruction (or any other instruction). + if (Opc == Instruction::Or) + std::swap(TBB, FBB); + + // Replace the old BB with the new BB. + for (auto &I : *TBB) { + PHINode *PN = dyn_cast<PHINode>(&I); + if (!PN) + break; + int i; + while ((i = PN->getBasicBlockIndex(&BB)) >= 0) + PN->setIncomingBlock(i, TmpBB); + } + + // Add another incoming edge form the new BB. + for (auto &I : *FBB) { + PHINode *PN = dyn_cast<PHINode>(&I); + if (!PN) + break; + auto *Val = PN->getIncomingValueForBlock(&BB); + PN->addIncoming(Val, TmpBB); + } + + // Update the branch weights (from SelectionDAGBuilder:: + // FindMergedConditions). + if (Opc == Instruction::Or) { + // Codegen X | Y as: + // BB1: + // jmp_if_X TBB + // jmp TmpBB + // TmpBB: + // jmp_if_Y TBB + // jmp FBB + // + + // We have flexibility in setting Prob for BB1 and Prob for NewBB. + // The requirement is that + // TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB) + // = TrueProb for orignal BB. + // Assuming the orignal weights are A and B, one choice is to set BB1's + // weights to A and A+2B, and set TmpBB's weights to A and 2B. This choice + // assumes that + // TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB. + // Another choice is to assume TrueProb for BB1 equals to TrueProb for + // TmpBB, but the math is more complicated. + uint64_t TrueWeight, FalseWeight; + if (extractBranchMetadata(Br1, TrueWeight, FalseWeight)) { + uint64_t NewTrueWeight = TrueWeight; + uint64_t NewFalseWeight = TrueWeight + 2 * FalseWeight; + scaleWeights(NewTrueWeight, NewFalseWeight); + Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext()) + .createBranchWeights(TrueWeight, FalseWeight)); + + NewTrueWeight = TrueWeight; + NewFalseWeight = 2 * FalseWeight; + scaleWeights(NewTrueWeight, NewFalseWeight); + Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext()) + .createBranchWeights(TrueWeight, FalseWeight)); + } + } else { + // Codegen X & Y as: + // BB1: + // jmp_if_X TmpBB + // jmp FBB + // TmpBB: + // jmp_if_Y TBB + // jmp FBB + // + // This requires creation of TmpBB after CurBB. + + // We have flexibility in setting Prob for BB1 and Prob for TmpBB. + // The requirement is that + // FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB) + // = FalseProb for orignal BB. + // Assuming the orignal weights are A and B, one choice is to set BB1's + // weights to 2A+B and B, and set TmpBB's weights to 2A and B. This choice + // assumes that + // FalseProb for BB1 == TrueProb for BB1 * FalseProb for TmpBB. 
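For the Or case above, the comment derives Br1's weights as A : A+2B and TmpBB's as A : 2B from original weights A and B, and scaleWeights() then divides both by (max / UINT32_MAX) + 1 so they fit in 32-bit branch-weight metadata. A worked standalone computation with made-up profile counts (scaleWeights copied in shape from the hunk above):

#include <cstdint>
#include <cstdio>

static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
  uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
  uint32_t Scale = (uint32_t)(NewMax / UINT32_MAX) + 1;
  NewTrue /= Scale;
  NewFalse /= Scale;
}

int main() {
  // Made-up counts: A times taken, B times not taken on the original branch.
  uint64_t A = 6000000000ULL, B = 1000000000ULL;

  // Or case, Br1 (jmp_if_X): TrueWeight A, FalseWeight A + 2B.
  uint64_t T1 = A, F1 = A + 2 * B;
  scaleWeights(T1, F1); // 8e9 exceeds UINT32_MAX once, so Scale == 2

  // Or case, Br2 in TmpBB (jmp_if_Y): TrueWeight A, FalseWeight 2B.
  uint64_t T2 = A, F2 = 2 * B;
  scaleWeights(T2, F2);

  std::printf("Br1 %llu:%llu  Br2 %llu:%llu\n",
              (unsigned long long)T1, (unsigned long long)F1,
              (unsigned long long)T2, (unsigned long long)F2);
  // Prints: Br1 3000000000:4000000000  Br2 3000000000:1000000000
}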
+ uint64_t TrueWeight, FalseWeight; + if (extractBranchMetadata(Br1, TrueWeight, FalseWeight)) { + uint64_t NewTrueWeight = 2 * TrueWeight + FalseWeight; + uint64_t NewFalseWeight = FalseWeight; + scaleWeights(NewTrueWeight, NewFalseWeight); + Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext()) + .createBranchWeights(TrueWeight, FalseWeight)); + + NewTrueWeight = 2 * TrueWeight; + NewFalseWeight = FalseWeight; + scaleWeights(NewTrueWeight, NewFalseWeight); + Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext()) + .createBranchWeights(TrueWeight, FalseWeight)); + } + } + + // Request DOM Tree update. + // Note: No point in getting fancy here, since the DT info is never + // available to CodeGenPrepare and the existing update code is broken + // anyways. + ModifiedDT = true; + + MadeChange = true; + + DEBUG(dbgs() << "After branch condition splitting\n"; BB.dump(); + TmpBB->dump()); + } + return MadeChange; +}
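As a final sanity check, the split CFG computes the same predicate as the fused or/and branch on every input; a trivial standalone truth-table check (not part of the patch):

#include <cassert>
#include <cstdio>

int main() {
  for (int c1 = 0; c1 <= 1; ++c1)
    for (int c2 = 0; c2 <= 1; ++c2) {
      // Original: br i1 (or c1, c2), TBB, FBB.
      bool OrBefore = c1 || c2;
      // Split form: BB1 branches on c1 to TBB, else TmpBB branches on c2.
      bool OrAfter = c1 ? true : (c2 != 0);
      // Original: br i1 (and c1, c2); split form enters TmpBB only on c1.
      bool AndBefore = c1 && c2;
      bool AndAfter = c1 ? (c2 != 0) : false;
      assert(OrBefore == OrAfter && AndBefore == AndAfter);
    }
  std::printf("split preserves branch semantics on all inputs\n");
}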