Diffstat (limited to 'lib/Transforms/Scalar')
24 files changed, 1805 insertions, 1650 deletions
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp index a5adb5e..ba214d1 100644 --- a/lib/Transforms/Scalar/ADCE.cpp +++ b/lib/Transforms/Scalar/ADCE.cpp @@ -57,6 +57,7 @@ bool ADCE::runOnFunction(Function& F) { for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) if (isa<TerminatorInst>(I.getInstructionIterator()) || isa<DbgInfoIntrinsic>(I.getInstructionIterator()) || + isa<LandingPadInst>(I.getInstructionIterator()) || I->mayHaveSideEffects()) { alive.insert(I.getInstructionIterator()); worklist.push_back(I.getInstructionIterator()); @@ -65,7 +66,6 @@ bool ADCE::runOnFunction(Function& F) { // Propagate liveness backwards to operands. while (!worklist.empty()) { Instruction* curr = worklist.pop_back_val(); - for (Instruction::op_iterator OI = curr->op_begin(), OE = curr->op_end(); OI != OE; ++OI) if (Instruction* Inst = dyn_cast<Instruction>(OI)) diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index c223da6..a6f0cf3 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -7,6 +7,7 @@ add_llvm_library(LLVMScalarOpts DCE.cpp DeadStoreElimination.cpp EarlyCSE.cpp + GlobalMerge.cpp GVN.cpp IndVarSimplify.cpp JumpThreading.cpp @@ -29,6 +30,14 @@ add_llvm_library(LLVMScalarOpts SimplifyCFGPass.cpp SimplifyLibCalls.cpp Sink.cpp - TailDuplication.cpp TailRecursionElimination.cpp ) + +add_llvm_library_dependencies(LLVMScalarOpts + LLVMAnalysis + LLVMCore + LLVMInstCombine + LLVMSupport + LLVMTarget + LLVMTransformUtils + ) diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index 17beeb5..f8f18b2 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -58,6 +58,7 @@ STATISTIC(NumMemoryInsts, "Number of memory instructions whose address " STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads"); STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized"); STATISTIC(NumRetsDup, "Number of return instructions duplicated"); +STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); static cl::opt<bool> DisableBranchOpts( "disable-cgp-branch-opts", cl::Hidden, cl::init(false), @@ -110,6 +111,7 @@ namespace { bool MoveExtToFormExtLoad(Instruction *I); bool OptimizeExtUses(Instruction *I); bool DupRetToEnableTailCallOpts(ReturnInst *RI); + bool PlaceDbgValues(Function &F); }; } @@ -132,6 +134,11 @@ bool CodeGenPrepare::runOnFunction(Function &F) { // unconditional branch. EverMadeChange |= EliminateMostlyEmptyBlocks(F); + // llvm.dbg.value is far away from the value then iSel may not be able + // handle it properly. iSel will drop llvm.dbg.value if it can not + // find a node corresponding to the value. 
+ EverMadeChange |= PlaceDbgValues(F); + bool MadeChange = true; while (MadeChange) { MadeChange = false; @@ -410,8 +417,7 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){ CastInst *&InsertedCast = InsertedCasts[UserBB]; if (!InsertedCast) { - BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI(); - + BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); InsertedCast = CastInst::Create(CI->getOpcode(), CI->getOperand(0), CI->getType(), "", InsertPt); @@ -467,8 +473,7 @@ static bool OptimizeCmpExpression(CmpInst *CI) { CmpInst *&InsertedCmp = InsertedCmps[UserBB]; if (!InsertedCmp) { - BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI(); - + BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); InsertedCmp = CmpInst::Create(CI->getOpcode(), CI->getPredicate(), CI->getOperand(0), @@ -551,22 +556,6 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { // From here on out we're working with named functions. if (CI->getCalledFunction() == 0) return false; - // llvm.dbg.value is far away from the value then iSel may not be able - // handle it properly. iSel will drop llvm.dbg.value if it can not - // find a node corresponding to the value. - if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(CI)) - if (Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue())) - if (!VI->isTerminator() && - (DVI->getParent() != VI->getParent() || DT->dominates(DVI, VI))) { - DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI); - DVI->removeFromParent(); - if (isa<PHINode>(VI)) - DVI->insertBefore(VI->getParent()->getFirstNonPHI()); - else - DVI->insertAfter(VI); - return true; - } - // We'll need TargetData from here on out. const TargetData *TD = TLI ? TLI->getTargetData() : 0; if (!TD) return false; @@ -746,13 +735,11 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, worklist.pop_back(); // Break use-def graph loops. - if (Visited.count(V)) { + if (!Visited.insert(V)) { Consensus = 0; break; } - Visited.insert(V); - // For a PHI node, push all of its incoming values. if (PHINode *P = dyn_cast<PHINode>(V)) { for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) @@ -763,7 +750,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // For non-PHIs, determine the addressing mode being computed. SmallVector<Instruction*, 16> NewAddrModeInsts; ExtAddrMode NewAddrMode = - AddressingModeMatcher::Match(V, AccessTy,MemoryInst, + AddressingModeMatcher::Match(V, AccessTy, MemoryInst, NewAddrModeInsts, *TLI); // This check is broken into two cases with very similar code to avoid using @@ -822,7 +809,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // Insert this computation right after this user. Since our caller is // scanning from the top of the BB to the bottom, reuse of the expr are // guaranteed to happen later. - BasicBlock::iterator InsertPt = MemoryInst; + IRBuilder<> Builder(MemoryInst); // Now that we determined the addressing expression we want to use and know // that we have to sink it into this block. 
Check to see if we have already @@ -833,7 +820,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for " << *MemoryInst); if (SunkAddr->getType() != Addr->getType()) - SunkAddr = new BitCastInst(SunkAddr, Addr->getType(), "tmp", InsertPt); + SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType()); } else { DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " << *MemoryInst); @@ -850,10 +837,9 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, if (AddrMode.BaseReg) { Value *V = AddrMode.BaseReg; if (V->getType()->isPointerTy()) - V = new PtrToIntInst(V, IntPtrTy, "sunkaddr", InsertPt); + V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr"); if (V->getType() != IntPtrTy) - V = CastInst::CreateIntegerCast(V, IntPtrTy, /*isSigned=*/true, - "sunkaddr", InsertPt); + V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr"); Result = V; } @@ -863,29 +849,27 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, if (V->getType() == IntPtrTy) { // done. } else if (V->getType()->isPointerTy()) { - V = new PtrToIntInst(V, IntPtrTy, "sunkaddr", InsertPt); + V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr"); } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() < cast<IntegerType>(V->getType())->getBitWidth()) { - V = new TruncInst(V, IntPtrTy, "sunkaddr", InsertPt); + V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr"); } else { - V = new SExtInst(V, IntPtrTy, "sunkaddr", InsertPt); + V = Builder.CreateSExt(V, IntPtrTy, "sunkaddr"); } if (AddrMode.Scale != 1) - V = BinaryOperator::CreateMul(V, ConstantInt::get(IntPtrTy, - AddrMode.Scale), - "sunkaddr", InsertPt); + V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale), + "sunkaddr"); if (Result) - Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt); + Result = Builder.CreateAdd(Result, V, "sunkaddr"); else Result = V; } // Add in the BaseGV if present. 
if (AddrMode.BaseGV) { - Value *V = new PtrToIntInst(AddrMode.BaseGV, IntPtrTy, "sunkaddr", - InsertPt); + Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr"); if (Result) - Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt); + Result = Builder.CreateAdd(Result, V, "sunkaddr"); else Result = V; } @@ -894,7 +878,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, if (AddrMode.BaseOffs) { Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs); if (Result) - Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt); + Result = Builder.CreateAdd(Result, V, "sunkaddr"); else Result = V; } @@ -902,7 +886,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, if (Result == 0) SunkAddr = Constant::getNullValue(Addr->getType()); else - SunkAddr = new IntToPtrInst(Result, Addr->getType(), "sunkaddr",InsertPt); + SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr"); } MemoryInst->replaceUsesOfWith(Repl, SunkAddr); @@ -1059,8 +1043,7 @@ bool CodeGenPrepare::OptimizeExtUses(Instruction *I) { Instruction *&InsertedTrunc = InsertedTruncs[UserBB]; if (!InsertedTrunc) { - BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI(); - + BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); InsertedTrunc = new TruncInst(I, Src->getType(), "", InsertPt); } @@ -1159,3 +1142,34 @@ bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) { return MadeChange; } + +// llvm.dbg.value is far away from the value then iSel may not be able +// handle it properly. iSel will drop llvm.dbg.value if it can not +// find a node corresponding to the value. +bool CodeGenPrepare::PlaceDbgValues(Function &F) { + bool MadeChange = false; + for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { + Instruction *PrevNonDbgInst = NULL; + for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) { + Instruction *Insn = BI; ++BI; + DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn); + if (!DVI) { + PrevNonDbgInst = Insn; + continue; + } + + Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue()); + if (VI && VI != PrevNonDbgInst && !VI->isTerminator()) { + DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI); + DVI->removeFromParent(); + if (isa<PHINode>(VI)) + DVI->insertBefore(VI->getParent()->getFirstInsertionPt()); + else + DVI->insertAfter(VI); + MadeChange = true; + ++NumDbgValueMoved; + } + } + } + return MadeChange; +} diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index e6089a9..a593d0f 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -52,18 +52,18 @@ namespace { AA = &getAnalysis<AliasAnalysis>(); MD = &getAnalysis<MemoryDependenceAnalysis>(); DominatorTree &DT = getAnalysis<DominatorTree>(); - + bool Changed = false; for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) // Only check non-dead blocks. Dead blocks may have strange pointer // cycles that will confuse alias analysis. 
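A note on the CodeGenPrepare::OptimizeMemoryInst hunks above: the address computation that gets sunk next to the memory access is no longer built with raw constructors taking an explicit InsertPt; an IRBuilder positioned at MemoryInst now carries the insertion point. Below is only a sketch of that pattern, reduced to a base pointer plus a constant offset, using illustrative names (SinkBasePlusOffset and its parameter list are not part of the patch) and only the builder calls visible in the hunks.

  #include "llvm/Constants.h"
  #include "llvm/Instructions.h"
  #include "llvm/Support/IRBuilder.h"
  using namespace llvm;

  // Everything created by the builder is inserted immediately before
  // MemoryInst, which is what the old explicit InsertPt achieved.
  static Value *SinkBasePlusOffset(Instruction *MemoryInst, Value *Base,
                                   int64_t Offset, Type *IntPtrTy,
                                   Type *AddrTy) {
    IRBuilder<> Builder(MemoryInst);
    Value *V = Base;
    if (V->getType()->isPointerTy())
      V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
    if (Offset)
      V = Builder.CreateAdd(V, ConstantInt::get(IntPtrTy, Offset), "sunkaddr");
    return Builder.CreateIntToPtr(V, AddrTy, "sunkaddr");
  }

The same builder also covers the reuse path in the patch, where a previously sunk address only needs a Builder.CreateBitCast to the type expected by the new user.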
if (DT.isReachableFromEntry(I)) Changed |= runOnBasicBlock(*I); - + AA = 0; MD = 0; return Changed; } - + bool runOnBasicBlock(BasicBlock &BB); bool HandleFree(CallInst *F); bool handleEndBlock(BasicBlock &BB); @@ -105,34 +105,34 @@ static void DeleteDeadInstruction(Instruction *I, MemoryDependenceAnalysis &MD, SmallPtrSet<Value*, 16> *ValueSet = 0) { SmallVector<Instruction*, 32> NowDeadInsts; - + NowDeadInsts.push_back(I); --NumFastOther; - + // Before we touch this instruction, remove it from memdep! do { Instruction *DeadInst = NowDeadInsts.pop_back_val(); ++NumFastOther; - + // This instruction is dead, zap it, in stages. Start by removing it from // MemDep, which needs to know the operands and needs it to be in the // function. MD.removeInstruction(DeadInst); - + for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) { Value *Op = DeadInst->getOperand(op); DeadInst->setOperand(op, 0); - + // If this operand just became dead, add it to the NowDeadInsts list. if (!Op->use_empty()) continue; - + if (Instruction *OpI = dyn_cast<Instruction>(Op)) if (isInstructionTriviallyDead(OpI)) NowDeadInsts.push_back(OpI); } - + DeadInst->eraseFromParent(); - + if (ValueSet) ValueSet->erase(DeadInst); } while (!NowDeadInsts.empty()); } @@ -159,11 +159,13 @@ static bool hasMemoryWrite(Instruction *I) { } /// getLocForWrite - Return a Location stored to by the specified instruction. +/// If isRemovable returns true, this function and getLocForRead completely +/// describe the memory operations for this instruction. static AliasAnalysis::Location getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) return AA.getLocation(SI); - + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Inst)) { // memcpy/memmove/memset. AliasAnalysis::Location Loc = AA.getLocationForDest(MI); @@ -174,10 +176,10 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { return AliasAnalysis::Location(); return Loc; } - + IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst); if (II == 0) return AliasAnalysis::Location(); - + switch (II->getIntrinsicID()) { default: return AliasAnalysis::Location(); // Unhandled intrinsic. case Intrinsic::init_trampoline: @@ -185,7 +187,7 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { // that we should use the size of the pointee type. This isn't valid for // init.trampoline, which writes more than an i8. if (AA.getTargetData() == 0) return AliasAnalysis::Location(); - + // FIXME: We don't know the size of the trampoline, so we can't really // handle it here. return AliasAnalysis::Location(II->getArgOperand(0)); @@ -198,10 +200,10 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { /// getLocForRead - Return the location read by the specified "hasMemoryWrite" /// instruction if any. -static AliasAnalysis::Location +static AliasAnalysis::Location getLocForRead(Instruction *Inst, AliasAnalysis &AA) { assert(hasMemoryWrite(Inst) && "Unknown instruction case"); - + // The only instructions that both read and write are the mem transfer // instructions (memcpy/memmove). if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst)) @@ -213,10 +215,10 @@ getLocForRead(Instruction *Inst, AliasAnalysis &AA) { /// isRemovable - If the value of this instruction and the memory it writes to /// is unused, may we delete this instruction? static bool isRemovable(Instruction *I) { - // Don't remove volatile stores. + // Don't remove volatile/atomic stores. 
if (StoreInst *SI = dyn_cast<StoreInst>(I)) - return !SI->isVolatile(); - + return SI->isUnordered(); + IntrinsicInst *II = cast<IntrinsicInst>(I); switch (II->getIntrinsicID()) { default: assert(0 && "doesn't pass 'hasMemoryWrite' predicate"); @@ -227,7 +229,7 @@ static bool isRemovable(Instruction *I) { case Intrinsic::init_trampoline: // Always safe to remove init_trampoline. return true; - + case Intrinsic::memset: case Intrinsic::memmove: case Intrinsic::memcpy: @@ -255,14 +257,14 @@ static uint64_t getPointerSize(Value *V, AliasAnalysis &AA) { const TargetData *TD = AA.getTargetData(); if (TD == 0) return AliasAnalysis::UnknownSize; - + if (AllocaInst *A = dyn_cast<AllocaInst>(V)) { // Get size information for the alloca if (ConstantInt *C = dyn_cast<ConstantInt>(A->getArraySize())) return C->getZExtValue() * TD->getTypeAllocSize(A->getAllocatedType()); return AliasAnalysis::UnknownSize; } - + assert(isa<Argument>(V) && "Expected AllocaInst or Argument!"); PointerType *PT = cast<PointerType>(V->getType()); return TD->getTypeAllocSize(PT->getElementType()); @@ -287,7 +289,7 @@ static bool isCompleteOverwrite(const AliasAnalysis::Location &Later, AliasAnalysis &AA) { const Value *P1 = Earlier.Ptr->stripPointerCasts(); const Value *P2 = Later.Ptr->stripPointerCasts(); - + // If the start pointers are the same, we just have to compare sizes to see if // the later store was larger than the earlier store. if (P1 == P2) { @@ -302,33 +304,33 @@ static bool isCompleteOverwrite(const AliasAnalysis::Location &Later, return Later.Ptr->getType() == Earlier.Ptr->getType(); return false; } - + // Make sure that the Later size is >= the Earlier size. if (Later.Size < Earlier.Size) return false; return true; } - + // Otherwise, we have to have size information, and the later store has to be // larger than the earlier one. if (Later.Size == AliasAnalysis::UnknownSize || Earlier.Size == AliasAnalysis::UnknownSize || Later.Size <= Earlier.Size || AA.getTargetData() == 0) return false; - + // Check to see if the later store is to the entire object (either a global, // an alloca, or a byval argument). If so, then it clearly overwrites any // other store to the same object. const TargetData &TD = *AA.getTargetData(); - + const Value *UO1 = GetUnderlyingObject(P1, &TD), *UO2 = GetUnderlyingObject(P2, &TD); - + // If we can't resolve the same pointers to the same object, then we can't // analyze them at all. if (UO1 != UO2) return false; - + // If the "Later" store is to a recognizable object, get its size. if (isObjectPointerWithTrustworthySize(UO2)) { uint64_t ObjectSize = @@ -336,26 +338,26 @@ static bool isCompleteOverwrite(const AliasAnalysis::Location &Later, if (ObjectSize == Later.Size) return true; } - + // Okay, we have stores to two completely different pointers. Try to // decompose the pointer into a "base + constant_offset" form. If the base // pointers are equal, then we can reason about the two stores. int64_t EarlierOff = 0, LaterOff = 0; const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, TD); const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, TD); - + // If the base pointers still differ, we have two completely different stores. if (BP1 != BP2) return false; // The later store completely overlaps the earlier store if: - // + // // 1. Both start at the same offset and the later one's size is greater than // or equal to the earlier one's, or // // |--earlier--| // |-- later --| - // + // // 2. 
The earlier store has an offset greater than the later offset, but which // still lies completely within the later store. // @@ -373,7 +375,7 @@ static bool isCompleteOverwrite(const AliasAnalysis::Location &Later, /// isPossibleSelfRead - If 'Inst' might be a self read (i.e. a noop copy of a /// memory region into an identical pointer) then it doesn't actually make its -/// input dead in the traditional sense. Consider this case: +/// input dead in the traditional sense. Consider this case: /// /// memcpy(A <- B) /// memcpy(A <- A) @@ -391,10 +393,10 @@ static bool isPossibleSelfRead(Instruction *Inst, // location read. AliasAnalysis::Location InstReadLoc = getLocForRead(Inst, AA); if (InstReadLoc.Ptr == 0) return false; // Not a reading instruction. - + // If the read and written loc obviously don't alias, it isn't a read. if (AA.isNoAlias(InstReadLoc, InstStoreLoc)) return false; - + // Okay, 'Inst' may copy over itself. However, we can still remove a the // DepWrite instruction if we can prove that it reads from the same location // as Inst. This handles useful cases like: @@ -404,10 +406,10 @@ static bool isPossibleSelfRead(Instruction *Inst, // aliases, so removing the first memcpy is safe (assuming it writes <= # // bytes as the second one. AliasAnalysis::Location DepReadLoc = getLocForRead(DepWrite, AA); - + if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr)) return false; - + // If DepWrite doesn't read memory or if we can't prove it is a must alias, // then it can't be considered dead. return true; @@ -420,43 +422,43 @@ static bool isPossibleSelfRead(Instruction *Inst, bool DSE::runOnBasicBlock(BasicBlock &BB) { bool MadeChange = false; - + // Do a top-down walk on the BB. for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) { Instruction *Inst = BBI++; - + // Handle 'free' calls specially. if (CallInst *F = isFreeCall(Inst)) { MadeChange |= HandleFree(F); continue; } - + // If we find something that writes memory, get its memory dependence. if (!hasMemoryWrite(Inst)) continue; MemDepResult InstDep = MD->getDependency(Inst); - + // Ignore any store where we can't find a local dependence. // FIXME: cross-block DSE would be fun. :) - if (InstDep.isNonLocal() || InstDep.isUnknown()) + if (!InstDep.isDef() && !InstDep.isClobber()) continue; - + // If we're storing the same value back to a pointer that we just // loaded from, then the store can be removed. if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { if (LoadInst *DepLoad = dyn_cast<LoadInst>(InstDep.getInst())) { if (SI->getPointerOperand() == DepLoad->getPointerOperand() && - SI->getOperand(0) == DepLoad && !SI->isVolatile()) { + SI->getOperand(0) == DepLoad && isRemovable(SI)) { DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n " << "LOAD: " << *DepLoad << "\n STORE: " << *SI << '\n'); - + // DeleteDeadInstruction can delete the current instruction. Save BBI // in case we need it. WeakVH NextInst(BBI); - + DeleteDeadInstruction(SI, *MD); - + if (NextInst == 0) // Next instruction deleted. BBI = BB.begin(); else if (BBI != BB.begin()) // Revisit this instruction if possible. @@ -467,15 +469,15 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { } } } - + // Figure out what location is being stored to. AliasAnalysis::Location Loc = getLocForWrite(Inst, *AA); // If we didn't get a useful location, fail. 
if (Loc.Ptr == 0) continue; - - while (!InstDep.isNonLocal() && !InstDep.isUnknown()) { + + while (InstDep.isDef() || InstDep.isClobber()) { // Get the memory clobbered by the instruction we depend on. MemDep will // skip any instructions that 'Loc' clearly doesn't interact with. If we // end up depending on a may- or must-aliased load, then we can't optimize @@ -496,12 +498,12 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { !isPossibleSelfRead(Inst, Loc, DepWrite, *AA)) { DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite << "\n KILLER: " << *Inst << '\n'); - + // Delete the store and now-dead instructions that feed it. DeleteDeadInstruction(DepWrite, *MD); ++NumFastStores; MadeChange = true; - + // DeleteDeadInstruction can delete the current instruction in loop // cases, reset BBI. BBI = Inst; @@ -509,7 +511,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { --BBI; break; } - + // If this is a may-aliased store that is clobbering the store value, we // can keep searching past it for another must-aliased pointer that stores // to the same location. For example, in: @@ -519,20 +521,20 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { // we can remove the first store to P even though we don't know if P and Q // alias. if (DepWrite == &BB.front()) break; - + // Can't look past this instruction if it might read 'Loc'. if (AA->getModRefInfo(DepWrite, Loc) & AliasAnalysis::Ref) break; - + InstDep = MD->getPointerDependencyFrom(Loc, false, DepWrite, &BB); } } - + // If this block ends in a return, unwind, or unreachable, all allocas are // dead at its end, which means stores to them are also dead. if (BB.getTerminator()->getNumSuccessors() == 0) MadeChange |= handleEndBlock(BB); - + return MadeChange; } @@ -543,18 +545,18 @@ bool DSE::HandleFree(CallInst *F) { MemDepResult Dep = MD->getDependency(F); - while (!Dep.isNonLocal() && !Dep.isUnknown()) { + while (Dep.isDef() || Dep.isClobber()) { Instruction *Dependency = Dep.getInst(); if (!hasMemoryWrite(Dependency) || !isRemovable(Dependency)) return MadeChange; - + Value *DepPointer = GetUnderlyingObject(getStoredPointerOperand(Dependency)); // Check for aliasing. if (!AA->isMustAlias(F->getArgOperand(0), DepPointer)) return MadeChange; - + // DCE instructions only used to calculate that store DeleteDeadInstruction(Dependency, *MD); ++NumFastStores; @@ -567,7 +569,7 @@ bool DSE::HandleFree(CallInst *F) { // free(s); Dep = MD->getDependency(F); }; - + return MadeChange; } @@ -579,28 +581,28 @@ bool DSE::HandleFree(CallInst *F) { /// ret void bool DSE::handleEndBlock(BasicBlock &BB) { bool MadeChange = false; - + // Keep track of all of the stack objects that are dead at the end of the // function. SmallPtrSet<Value*, 16> DeadStackObjects; - + // Find all of the alloca'd pointers in the entry block. BasicBlock *Entry = BB.getParent()->begin(); for (BasicBlock::iterator I = Entry->begin(), E = Entry->end(); I != E; ++I) if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) DeadStackObjects.insert(AI); - + // Treat byval arguments the same, stores to them are dead at the end of the // function. for (Function::arg_iterator AI = BB.getParent()->arg_begin(), AE = BB.getParent()->arg_end(); AI != AE; ++AI) if (AI->hasByValAttr()) DeadStackObjects.insert(AI); - + // Scan the basic block backwards for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ){ --BBI; - + // If we find a store, check to see if it points into a dead stack value. 
if (hasMemoryWrite(BBI) && isRemovable(BBI)) { // See through pointer-to-pointer bitcasts @@ -609,10 +611,10 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // Stores to stack values are valid candidates for removal. if (DeadStackObjects.count(Pointer)) { Instruction *Dead = BBI++; - + DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: " << *Dead << "\n Object: " << *Pointer << '\n'); - + // DCE instructions only used to calculate that store. DeleteDeadInstruction(Dead, *MD, &DeadStackObjects); ++NumFastStores; @@ -620,7 +622,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { continue; } } - + // Remove any dead non-memory-mutating instructions. if (isInstructionTriviallyDead(BBI)) { Instruction *Inst = BBI++; @@ -629,55 +631,61 @@ bool DSE::handleEndBlock(BasicBlock &BB) { MadeChange = true; continue; } - + if (AllocaInst *A = dyn_cast<AllocaInst>(BBI)) { DeadStackObjects.erase(A); continue; } - + if (CallSite CS = cast<Value>(BBI)) { // If this call does not access memory, it can't be loading any of our // pointers. if (AA->doesNotAccessMemory(CS)) continue; - + // If the call might load from any of our allocas, then any store above // the call is live. SmallVector<Value*, 8> LiveAllocas; for (SmallPtrSet<Value*, 16>::iterator I = DeadStackObjects.begin(), E = DeadStackObjects.end(); I != E; ++I) { // See if the call site touches it. - AliasAnalysis::ModRefResult A = + AliasAnalysis::ModRefResult A = AA->getModRefInfo(CS, *I, getPointerSize(*I, *AA)); - + if (A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref) LiveAllocas.push_back(*I); } - + for (SmallVector<Value*, 8>::iterator I = LiveAllocas.begin(), E = LiveAllocas.end(); I != E; ++I) DeadStackObjects.erase(*I); - + // If all of the allocas were clobbered by the call then we're not going // to find anything else to process. if (DeadStackObjects.empty()) return MadeChange; - + continue; } - + AliasAnalysis::Location LoadedLoc; - + // If we encounter a use of the pointer, it is no longer considered dead if (LoadInst *L = dyn_cast<LoadInst>(BBI)) { + if (!L->isUnordered()) // Be conservative with atomic/volatile load + break; LoadedLoc = AA->getLocation(L); } else if (VAArgInst *V = dyn_cast<VAArgInst>(BBI)) { LoadedLoc = AA->getLocation(V); } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(BBI)) { LoadedLoc = AA->getLocationForSource(MTI); - } else { - // Not a loading instruction. + } else if (!BBI->mayReadFromMemory()) { + // Instruction doesn't read memory. Note that stores that weren't removed + // above will hit this case. continue; + } else { + // Unknown inst; assume it clobbers everything. + break; } // Remove any allocas from the DeadPointer set that are loaded, as this @@ -689,7 +697,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { if (DeadStackObjects.empty()) break; } - + return MadeChange; } @@ -703,14 +711,14 @@ void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, // A constant can't be in the dead pointer set. if (isa<Constant>(UnderlyingPointer)) return; - + // If the kill pointer can be easily reduced to an alloca, don't bother doing // extraneous AA queries. 
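The DeadStoreElimination hunks above tighten how atomic and volatile operations are treated: isRemovable now requires SI->isUnordered() rather than merely !SI->isVolatile(), the store-of-own-load case defers to isRemovable, and the backwards scan in handleEndBlock stops at non-unordered loads and at any unrecognized instruction that may read memory. A minimal sketch of the store-side predicate, assuming the intrinsic handling stays as in the patch (the helper name is illustrative):

  #include "llvm/Instructions.h"
  using namespace llvm;

  // Only simple/unordered stores are candidates for deletion; volatile and
  // atomic-ordered stores are always kept. The real isRemovable() additionally
  // whitelists lifetime, init.trampoline and mem* intrinsics.
  static bool isDeletableStore(Instruction *I) {
    if (StoreInst *SI = dyn_cast<StoreInst>(I))
      return SI->isUnordered();
    return false;
  }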
if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) { DeadStackObjects.erase(const_cast<Value*>(UnderlyingPointer)); return; } - + SmallVector<Value*, 16> NowLive; for (SmallPtrSet<Value*, 16>::iterator I = DeadStackObjects.begin(), E = DeadStackObjects.end(); I != E; ++I) { diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index 3d3f17b..c0223d2 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -92,7 +92,7 @@ unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { // Hash in all of the operands as pointers. unsigned Res = 0; for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) - Res ^= getHash(Inst->getOperand(i)) << i; + Res ^= getHash(Inst->getOperand(i)) << (i & 0xF); if (CastInst *CI = dyn_cast<CastInst>(Inst)) Res ^= getHash(CI->getType()); @@ -185,7 +185,7 @@ unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) { for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) { assert(!Inst->getOperand(i)->getType()->isMetadataTy() && "Cannot value number calls with metadata operands"); - Res ^= getHash(Inst->getOperand(i)) << i; + Res ^= getHash(Inst->getOperand(i)) << (i & 0xF); } // Mix in the opcode. @@ -357,7 +357,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // If this is a non-volatile load, process it. if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { // Ignore volatile loads. - if (LI->isVolatile()) { + if (!LI->isSimple()) { LastStore = 0; continue; } @@ -437,7 +437,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { std::pair<Value*, unsigned>(SI->getValueOperand(), CurrentGeneration)); // Remember that this was the last store we saw for DSE. - if (!SI->isVolatile()) + if (SI->isSimple()) LastStore = SI; } } diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index b4d5667..a51cbb6 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -41,12 +41,16 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/IRBuilder.h" +#include "llvm/Support/PatternMatch.h" using namespace llvm; +using namespace PatternMatch; STATISTIC(NumGVNInstr, "Number of instructions deleted"); STATISTIC(NumGVNLoad, "Number of loads deleted"); STATISTIC(NumGVNPRE, "Number of instructions PRE'd"); STATISTIC(NumGVNBlocks, "Number of blocks merged"); +STATISTIC(NumGVNSimpl, "Number of instructions simplified"); +STATISTIC(NumGVNEqProp, "Number of equalities propagated"); STATISTIC(NumPRELoad, "Number of loads PRE'd"); static cl::opt<bool> EnablePRE("enable-pre", @@ -548,6 +552,9 @@ namespace { void cleanupGlobalSets(); void verifyRemoved(const Instruction *I) const; bool splitCriticalEdges(); + unsigned replaceAllDominatedUsesWith(Value *From, Value *To, + BasicBlock *Root); + bool propagateEquality(Value *LHS, Value *RHS, BasicBlock *Root); }; char GVN::ID = 0; @@ -689,8 +696,8 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, // If this is already the right type, just return it. Type *StoredValTy = StoredVal->getType(); - uint64_t StoreSize = TD.getTypeStoreSizeInBits(StoredValTy); - uint64_t LoadSize = TD.getTypeStoreSizeInBits(LoadedTy); + uint64_t StoreSize = TD.getTypeSizeInBits(StoredValTy); + uint64_t LoadSize = TD.getTypeSizeInBits(LoadedTy); // If the store and reload are the same size, we can always reuse it. 
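On the EarlyCSE hashing change above: the old combiner shifted each operand hash by the operand index, so a call with 32 or more arguments shifted by at least the width of unsigned (undefined behaviour), and large indices discarded most of that operand's hash bits anyway. Masking the shift with (i & 0xF) keeps it in the range 0..15. A standalone sketch of the corrected combiner (the function name and the flat array input are illustrative):

  // Mixes per-operand hashes while keeping every shift amount in [0, 15].
  static unsigned combineOperandHashes(const unsigned *Hashes, unsigned NumOps) {
    unsigned Res = 0;
    for (unsigned i = 0; i != NumOps; ++i)
      Res ^= Hashes[i] << (i & 0xF);
    return Res;
  }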
if (StoreSize == LoadSize) { @@ -920,7 +927,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, llvm::Type::getInt8PtrTy(Src->getContext())); Constant *OffsetCst = ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); - Src = ConstantExpr::getGetElementPtr(Src, &OffsetCst, 1); + Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy)); if (ConstantFoldLoadFromConstPtr(Src, &TD)) return Offset; @@ -946,10 +953,9 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, // Compute which bits of the stored value are being used by the load. Convert // to an integer type to start with. if (SrcVal->getType()->isPointerTy()) - SrcVal = Builder.CreatePtrToInt(SrcVal, TD.getIntPtrType(Ctx), "tmp"); + SrcVal = Builder.CreatePtrToInt(SrcVal, TD.getIntPtrType(Ctx)); if (!SrcVal->getType()->isIntegerTy()) - SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8), - "tmp"); + SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8)); // Shift the bits to the least significant depending on endianness. unsigned ShiftAmt; @@ -959,11 +965,10 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, ShiftAmt = (StoreSize-LoadSize-Offset)*8; if (ShiftAmt) - SrcVal = Builder.CreateLShr(SrcVal, ShiftAmt, "tmp"); + SrcVal = Builder.CreateLShr(SrcVal, ShiftAmt); if (LoadSize != StoreSize) - SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8), - "tmp"); + SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8)); return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, TD); } @@ -982,8 +987,8 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, unsigned SrcValSize = TD.getTypeStoreSize(SrcVal->getType()); unsigned LoadSize = TD.getTypeStoreSize(LoadTy); if (Offset+LoadSize > SrcValSize) { - assert(!SrcVal->isVolatile() && "Cannot widen volatile load!"); - assert(isa<IntegerType>(SrcVal->getType())&&"Can't widen non-integer load"); + assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!"); + assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load"); // If we have a load/load clobber an DepLI can be widened to cover this // load, then we should widen it to the next power of 2 size big enough! unsigned NewLoadSize = Offset+LoadSize; @@ -1081,7 +1086,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, llvm::Type::getInt8PtrTy(Src->getContext())); Constant *OffsetCst = ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); - Src = ConstantExpr::getGetElementPtr(Src, &OffsetCst, 1); + Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy)); return ConstantFoldLoadFromConstPtr(Src, &TD); } @@ -1274,7 +1279,9 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { // If we had a phi translation failure, we'll have a single entry which is a // clobber in the current block. Reject this early. 
- if (Deps.size() == 1 && Deps[0].getResult().isUnknown()) { + if (Deps.size() == 1 + && !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) + { DEBUG( dbgs() << "GVN: non-local load "; WriteAsOperand(dbgs(), LI); @@ -1294,7 +1301,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { BasicBlock *DepBB = Deps[i].getBB(); MemDepResult DepInfo = Deps[i].getResult(); - if (DepInfo.isUnknown()) { + if (!DepInfo.isDef() && !DepInfo.isClobber()) { UnavailableBlocks.push_back(DepBB); continue; } @@ -1359,7 +1366,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { continue; } - assert(DepInfo.isDef() && "Expecting def here"); + // DepInfo.isDef() here Instruction *DepInst = DepInfo.getInst(); @@ -1446,8 +1453,8 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i) Blockers.insert(UnavailableBlocks[i]); - // Lets find first basic block with more than one predecessor. Walk backwards - // through predecessors if needed. + // Let's find the first basic block with more than one predecessor. Walk + // backwards through predecessors if needed. BasicBlock *LoadBB = LI->getParent(); BasicBlock *TmpBB = LoadBB; @@ -1519,10 +1526,19 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { << Pred->getName() << "': " << *LI << '\n'); return false; } + + if (LoadBB->isLandingPad()) { + DEBUG(dbgs() + << "COULD NOT PRE LOAD BECAUSE OF LANDING PAD CRITICAL EDGE '" + << Pred->getName() << "': " << *LI << '\n'); + return false; + } + unsigned SuccNum = GetSuccessorNumber(Pred, LoadBB); NeedToSplit.push_back(std::make_pair(Pred->getTerminator(), SuccNum)); } } + if (!NeedToSplit.empty()) { toSplit.append(NeedToSplit.begin(), NeedToSplit.end()); return false; @@ -1660,7 +1676,7 @@ bool GVN::processLoad(LoadInst *L) { if (!MD) return false; - if (L->isVolatile()) + if (!L->isSimple()) return false; if (L->use_empty()) { @@ -1747,7 +1763,11 @@ bool GVN::processLoad(LoadInst *L) { return false; } - if (Dep.isUnknown()) { + // If it is defined in another block, try harder. + if (Dep.isNonLocal()) + return processNonLocalLoad(L); + + if (!Dep.isDef()) { DEBUG( // fast print dep, using operator<< on instruction is too slow. dbgs() << "GVN: load "; @@ -1757,12 +1777,6 @@ bool GVN::processLoad(LoadInst *L) { return false; } - // If it is defined in another block, try harder. - if (Dep.isNonLocal()) - return processNonLocalLoad(L); - - assert(Dep.isDef() && "Expecting def here"); - Instruction *DepInst = Dep.getInst(); if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) { Value *StoredVal = DepSI->getValueOperand(); @@ -1874,6 +1888,138 @@ Value *GVN::findLeader(BasicBlock *BB, uint32_t num) { return Val; } +/// replaceAllDominatedUsesWith - Replace all uses of 'From' with 'To' if the +/// use is dominated by the given basic block. Returns the number of uses that +/// were replaced. +unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To, + BasicBlock *Root) { + unsigned Count = 0; + for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); + UI != UE; ) { + Instruction *User = cast<Instruction>(*UI); + unsigned OpNum = UI.getOperandNo(); + ++UI; + + if (DT->dominates(Root, User->getParent())) { + User->setOperand(OpNum, To); + ++Count; + } + } + return Count; +} + +/// propagateEquality - The given values are known to be equal in every block +/// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with +/// 'RHS' everywhere in the scope. Returns whether a change was made. 
+bool GVN::propagateEquality(Value *LHS, Value *RHS, BasicBlock *Root) { + if (LHS == RHS) return false; + assert(LHS->getType() == RHS->getType() && "Equal but types differ!"); + + // Don't try to propagate equalities between constants. + if (isa<Constant>(LHS) && isa<Constant>(RHS)) + return false; + + // Make sure that any constants are on the right-hand side. In general the + // best results are obtained by placing the longest lived value on the RHS. + if (isa<Constant>(LHS)) + std::swap(LHS, RHS); + + // If neither term is constant then bail out. This is not for correctness, + // it's just that the non-constant case is much less useful: it occurs just + // as often as the constant case but handling it hardly ever results in an + // improvement. + if (!isa<Constant>(RHS)) + return false; + + // If value numbering later deduces that an instruction in the scope is equal + // to 'LHS' then ensure it will be turned into 'RHS'. + addToLeaderTable(VN.lookup_or_add(LHS), RHS, Root); + + // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. As + // LHS always has at least one use that is not dominated by Root, this will + // never do anything if LHS has only one use. + bool Changed = false; + if (!LHS->hasOneUse()) { + unsigned NumReplacements = replaceAllDominatedUsesWith(LHS, RHS, Root); + Changed |= NumReplacements > 0; + NumGVNEqProp += NumReplacements; + } + + // Now try to deduce additional equalities from this one. For example, if the + // known equality was "(A != B)" == "false" then it follows that A and B are + // equal in the scope. Only boolean equalities with an explicit true or false + // RHS are currently supported. + if (!RHS->getType()->isIntegerTy(1)) + // Not a boolean equality - bail out. + return Changed; + ConstantInt *CI = dyn_cast<ConstantInt>(RHS); + if (!CI) + // RHS neither 'true' nor 'false' - bail out. + return Changed; + // Whether RHS equals 'true'. Otherwise it equals 'false'. + bool isKnownTrue = CI->isAllOnesValue(); + bool isKnownFalse = !isKnownTrue; + + // If "A && B" is known true then both A and B are known true. If "A || B" + // is known false then both A and B are known false. + Value *A, *B; + if ((isKnownTrue && match(LHS, m_And(m_Value(A), m_Value(B)))) || + (isKnownFalse && match(LHS, m_Or(m_Value(A), m_Value(B))))) { + Changed |= propagateEquality(A, RHS, Root); + Changed |= propagateEquality(B, RHS, Root); + return Changed; + } + + // If we are propagating an equality like "(A == B)" == "true" then also + // propagate the equality A == B. + if (ICmpInst *Cmp = dyn_cast<ICmpInst>(LHS)) { + // Only equality comparisons are supported. + if ((isKnownTrue && Cmp->getPredicate() == CmpInst::ICMP_EQ) || + (isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE)) { + Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1); + Changed |= propagateEquality(Op0, Op1, Root); + } + return Changed; + } + + return Changed; +} + +/// isOnlyReachableViaThisEdge - There is an edge from 'Src' to 'Dst'. Return +/// true if every path from the entry block to 'Dst' passes via this edge. In +/// particular 'Dst' must not be reachable via another edge from 'Src'. +static bool isOnlyReachableViaThisEdge(BasicBlock *Src, BasicBlock *Dst, + DominatorTree *DT) { + // First off, there must not be more than one edge from Src to Dst, there + // should be exactly one. So keep track of the number of times Src occurs + // as a predecessor of Dst and fail if it's more than once. 
Secondly, any + // other predecessors of Dst should be dominated by Dst (see logic below). + bool SawEdgeFromSrc = false; + for (pred_iterator PI = pred_begin(Dst), PE = pred_end(Dst); PI != PE; ++PI) { + BasicBlock *Pred = *PI; + if (Pred == Src) { + // An edge from Src to Dst. + if (SawEdgeFromSrc) + // There are multiple edges from Src to Dst - fail. + return false; + SawEdgeFromSrc = true; + continue; + } + // If the predecessor is not dominated by Dst, then it must be possible to + // reach it either without passing through Src (and thus not via the edge) + // or by passing through Src but taking a different edge out of Src. Either + // way it is possible to reach Dst without passing via the edge, so fail. + if (!DT->dominates(Dst, *PI)) + return false; + } + assert(SawEdgeFromSrc && "No edge between these basic blocks!"); + + // Every path from the entry block to Dst must at some point pass to Dst from + // a predecessor that is not dominated by Dst. This predecessor can only be + // Src, since all others are dominated by Dst. As there is only one edge from + // Src to Dst, the path passes by this edge. + return true; +} /// processInstruction - When calculating availability, handle an instruction /// by inserting it into the appropriate sets @@ -1891,6 +2037,7 @@ bool GVN::processInstruction(Instruction *I) { if (MD && V->getType()->isPointerTy()) MD->invalidateCachedPointerInfo(V); markInstructionForDeletion(I); + ++NumGVNSimpl; return true; } @@ -1903,30 +2050,45 @@ bool GVN::processInstruction(Instruction *I) { return false; } - // For conditions branches, we can perform simple conditional propagation on + // For conditional branches, we can perform simple conditional propagation on // the condition value itself. if (BranchInst *BI = dyn_cast<BranchInst>(I)) { if (!BI->isConditional() || isa<Constant>(BI->getCondition())) return false; - + Value *BranchCond = BI->getCondition(); - uint32_t CondVN = VN.lookup_or_add(BranchCond); - + BasicBlock *TrueSucc = BI->getSuccessor(0); BasicBlock *FalseSucc = BI->getSuccessor(1); - - if (TrueSucc->getSinglePredecessor()) - addToLeaderTable(CondVN, - ConstantInt::getTrue(TrueSucc->getContext()), - TrueSucc); - if (FalseSucc->getSinglePredecessor()) - addToLeaderTable(CondVN, - ConstantInt::getFalse(TrueSucc->getContext()), - FalseSucc); - - return false; + BasicBlock *Parent = BI->getParent(); + bool Changed = false; + + if (isOnlyReachableViaThisEdge(Parent, TrueSucc, DT)) + Changed |= propagateEquality(BranchCond, + ConstantInt::getTrue(TrueSucc->getContext()), + TrueSucc); + + if (isOnlyReachableViaThisEdge(Parent, FalseSucc, DT)) + Changed |= propagateEquality(BranchCond, + ConstantInt::getFalse(FalseSucc->getContext()), + FalseSucc); + + return Changed; } - + + // For switches, propagate the case values into the case destinations. + if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) { + Value *SwitchCond = SI->getCondition(); + BasicBlock *Parent = SI->getParent(); + bool Changed = false; + for (unsigned i = 1, e = SI->getNumCases(); i != e; ++i) { + BasicBlock *Dst = SI->getSuccessor(i); + if (isOnlyReachableViaThisEdge(Parent, Dst, DT)) + Changed |= propagateEquality(SwitchCond, SI->getCaseValue(i), Dst); + } + return Changed; + } + // Instructions with void type don't return a value, so there's // no point in trying to find redudancies in them. if (I->getType()->isVoidTy()) return false; @@ -2071,6 +2233,9 @@ bool GVN::performPRE(Function &F) { // Nothing to PRE in the entry block. 
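The GVN changes above generalize the old single-predecessor trick into real equality propagation: when an edge is the only way to reach its destination (isOnlyReachableViaThisEdge), the branch or switch condition is a known constant there, and propagateEquality rewrites every use of the condition that is dominated by that block. The rewriting itself is the replaceAllDominatedUsesWith helper shown above; the sketch below restates it with comments on the iterator handling, which is the subtle part (the free-function form and its name are illustrative):

  #include "llvm/Instructions.h"
  #include "llvm/Analysis/Dominators.h"
  using namespace llvm;

  static unsigned rewriteDominatedUses(Value *From, Value *To,
                                       BasicBlock *Root, DominatorTree *DT) {
    unsigned Count = 0;
    for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
         UI != UE; ) {
      Instruction *User = cast<Instruction>(*UI);
      // Record the operand number and advance before mutating: setOperand()
      // unlinks this use from From's use list, which would invalidate UI.
      unsigned OpNo = UI.getOperandNo();
      ++UI;
      if (DT->dominates(Root, User->getParent())) {
        User->setOperand(OpNo, To);
        ++Count;
      }
    }
    return Count;
  }

Since propagateEquality first moves any constant to the right-hand side, the common effect is folding the branch condition to true or false throughout the dominated region.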
if (CurrentBlock == &F.getEntryBlock()) continue; + // Don't perform PRE on a landing pad. + if (CurrentBlock->isLandingPad()) continue; + for (BasicBlock::iterator BI = CurrentBlock->begin(), BE = CurrentBlock->end(); BI != BE; ) { Instruction *CurInst = BI++; diff --git a/lib/Transforms/Scalar/GlobalMerge.cpp b/lib/Transforms/Scalar/GlobalMerge.cpp new file mode 100644 index 0000000..0772b48 --- /dev/null +++ b/lib/Transforms/Scalar/GlobalMerge.cpp @@ -0,0 +1,226 @@ +//===-- GlobalMerge.cpp - Internal globals merging -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This pass merges globals with internal linkage into one. This way all the +// globals which were merged into a biggest one can be addressed using offsets +// from the same base pointer (no need for separate base pointer for each of the +// global). Such a transformation can significantly reduce the register pressure +// when many globals are involved. +// +// For example, consider the code which touches several global variables at +// once: +// +// static int foo[N], bar[N], baz[N]; +// +// for (i = 0; i < N; ++i) { +// foo[i] = bar[i] * baz[i]; +// } +// +// On ARM the addresses of 3 arrays should be kept in the registers, thus +// this code has quite large register pressure (loop body): +// +// ldr r1, [r5], #4 +// ldr r2, [r6], #4 +// mul r1, r2, r1 +// str r1, [r0], #4 +// +// Pass converts the code to something like: +// +// static struct { +// int foo[N]; +// int bar[N]; +// int baz[N]; +// } merged; +// +// for (i = 0; i < N; ++i) { +// merged.foo[i] = merged.bar[i] * merged.baz[i]; +// } +// +// and in ARM code this becomes: +// +// ldr r0, [r5, #40] +// ldr r1, [r5, #80] +// mul r0, r1, r0 +// str r0, [r5], #4 +// +// note that we saved 2 registers here almostly "for free". +// ===---------------------------------------------------------------------===// + +#define DEBUG_TYPE "global-merge" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Attributes.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/GlobalVariable.h" +#include "llvm/Instructions.h" +#include "llvm/Intrinsics.h" +#include "llvm/Module.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumMerged , "Number of globals merged"); +namespace { + class GlobalMerge : public FunctionPass { + /// TLI - Keep a pointer of a TargetLowering to consult for determining + /// target type sizes. + const TargetLowering *TLI; + + bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals, + Module &M, bool isConst) const; + + public: + static char ID; // Pass identification, replacement for typeid. 
+ explicit GlobalMerge(const TargetLowering *tli = 0) + : FunctionPass(ID), TLI(tli) { + initializeGlobalMergePass(*PassRegistry::getPassRegistry()); + } + + virtual bool doInitialization(Module &M); + virtual bool runOnFunction(Function &F); + + const char *getPassName() const { + return "Merge internal globals"; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + FunctionPass::getAnalysisUsage(AU); + } + + struct GlobalCmp { + const TargetData *TD; + + GlobalCmp(const TargetData *td) : TD(td) { } + + bool operator()(const GlobalVariable *GV1, const GlobalVariable *GV2) { + Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType(); + Type *Ty2 = cast<PointerType>(GV2->getType())->getElementType(); + + return (TD->getTypeAllocSize(Ty1) < TD->getTypeAllocSize(Ty2)); + } + }; + }; +} // end anonymous namespace + +char GlobalMerge::ID = 0; +INITIALIZE_PASS(GlobalMerge, "global-merge", + "Global Merge", false, false) + + +bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, + Module &M, bool isConst) const { + const TargetData *TD = TLI->getTargetData(); + + // FIXME: Infer the maximum possible offset depending on the actual users + // (these max offsets are different for the users inside Thumb or ARM + // functions) + unsigned MaxOffset = TLI->getMaximalGlobalOffset(); + + // FIXME: Find better heuristics + std::stable_sort(Globals.begin(), Globals.end(), GlobalCmp(TD)); + + Type *Int32Ty = Type::getInt32Ty(M.getContext()); + + for (size_t i = 0, e = Globals.size(); i != e; ) { + size_t j = 0; + uint64_t MergedSize = 0; + std::vector<Type*> Tys; + std::vector<Constant*> Inits; + for (j = i; j != e; ++j) { + Type *Ty = Globals[j]->getType()->getElementType(); + MergedSize += TD->getTypeAllocSize(Ty); + if (MergedSize > MaxOffset) { + break; + } + Tys.push_back(Ty); + Inits.push_back(Globals[j]->getInitializer()); + } + + StructType *MergedTy = StructType::get(M.getContext(), Tys); + Constant *MergedInit = ConstantStruct::get(MergedTy, Inits); + GlobalVariable *MergedGV = new GlobalVariable(M, MergedTy, isConst, + GlobalValue::InternalLinkage, + MergedInit, "_MergedGlobals"); + for (size_t k = i; k < j; ++k) { + Constant *Idx[2] = { + ConstantInt::get(Int32Ty, 0), + ConstantInt::get(Int32Ty, k-i) + }; + Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(MergedGV, Idx); + Globals[k]->replaceAllUsesWith(GEP); + Globals[k]->eraseFromParent(); + NumMerged++; + } + i = j; + } + + return true; +} + + +bool GlobalMerge::doInitialization(Module &M) { + SmallVector<GlobalVariable*, 16> Globals, ConstGlobals, BSSGlobals; + const TargetData *TD = TLI->getTargetData(); + unsigned MaxOffset = TLI->getMaximalGlobalOffset(); + bool Changed = false; + + // Grab all non-const globals. + for (Module::global_iterator I = M.global_begin(), + E = M.global_end(); I != E; ++I) { + // Merge is safe for "normal" internal globals only + if (!I->hasLocalLinkage() || I->isThreadLocal() || I->hasSection()) + continue; + + // Ignore fancy-aligned globals for now. + unsigned Alignment = I->getAlignment(); + Type *Ty = I->getType()->getElementType(); + if (Alignment > TD->getABITypeAlignment(Ty)) + continue; + + // Ignore all 'special' globals. 
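For the new GlobalMerge pass, the interesting policy sits in doMerge above: candidates are sorted by allocation size and then packed greedily into one struct after another, cutting a group as soon as its running size would pass TLI->getMaximalGlobalOffset(). The standalone sketch below isolates that grouping decision from the IR bookkeeping; the names, the plain size vector, and the always-take-the-first-element guard are illustrative (the pass itself pre-filters so every candidate is smaller than MaxOffset):

  #include <stddef.h>
  #include <stdint.h>
  #include <vector>

  // Sizes are assumed sorted ascending, as produced by std::stable_sort with
  // GlobalCmp in the pass. Each returned group holds indices into SortedSizes.
  static std::vector<std::vector<size_t> >
  groupBySize(const std::vector<uint64_t> &SortedSizes, uint64_t MaxOffset) {
    std::vector<std::vector<size_t> > Groups;
    for (size_t i = 0, e = SortedSizes.size(); i != e; ) {
      std::vector<size_t> Group;
      uint64_t Running = 0;
      size_t j = i;
      for (; j != e; ++j) {
        Running += SortedSizes[j];
        if (!Group.empty() && Running > MaxOffset)
          break;               // this global starts the next merged struct
        Group.push_back(j);
      }
      Groups.push_back(Group);
      i = j;                   // Group is never empty, so this always advances
    }
    return Groups;
  }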
+ if (I->getName().startswith("llvm.") || + I->getName().startswith(".llvm.")) + continue; + + if (TD->getTypeAllocSize(Ty) < MaxOffset) { + const TargetLoweringObjectFile &TLOF = TLI->getObjFileLowering(); + if (TLOF.getKindForGlobal(I, TLI->getTargetMachine()).isBSSLocal()) + BSSGlobals.push_back(I); + else if (I->isConstant()) + ConstGlobals.push_back(I); + else + Globals.push_back(I); + } + } + + if (Globals.size() > 1) + Changed |= doMerge(Globals, M, false); + if (BSSGlobals.size() > 1) + Changed |= doMerge(BSSGlobals, M, false); + + // FIXME: This currently breaks the EH processing due to way how the + // typeinfo detection works. We might want to detect the TIs and ignore + // them in the future. + // if (ConstGlobals.size() > 1) + // Changed |= doMerge(ConstGlobals, M, true); + + return Changed; +} + +bool GlobalMerge::runOnFunction(Function &F) { + return false; +} + +Pass *llvm::createGlobalMergePass(const TargetLowering *tli) { + return new GlobalMerge(tli); +} diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 50140d9..75fa011 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -11,17 +11,6 @@ // computations derived from them) into simpler forms suitable for subsequent // analysis and transformation. // -// This transformation makes the following changes to each loop with an -// identifiable induction variable: -// 1. All loops are transformed to have a SINGLE canonical induction variable -// which starts at zero and steps by one. -// 2. The canonical induction variable is guaranteed to be the first PHI node -// in the loop header block. -// 3. The canonical induction variable is guaranteed to be in a wide enough -// type so that IV expressions need not be (directly) zero-extended or -// sign-extended. -// 4. Any pointer arithmetic recurrences are raised to use array subscripts. -// // If the trip count of a loop is computable, this pass also makes the following // changes: // 1. The exit condition for the loop is canonicalized to compare the @@ -33,9 +22,6 @@ // purpose of the loop is to compute the exit value of some derived // expression, this transformation will make the loop dead. // -// This transformation should be followed by strength reduction after all of the -// desired loop transformations have been performed. 
-// //===----------------------------------------------------------------------===// #define DEBUG_TYPE "indvars" @@ -57,11 +43,11 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/SimplifyIndVar.h" #include "llvm/Target/TargetData.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" using namespace llvm; STATISTIC(NumRemoved , "Number of aux indvars removed"); @@ -69,21 +55,21 @@ STATISTIC(NumWidened , "Number of indvars widened"); STATISTIC(NumInserted , "Number of canonical indvars added"); STATISTIC(NumReplaced , "Number of exit values replaced"); STATISTIC(NumLFTR , "Number of loop exit tests replaced"); -STATISTIC(NumElimIdentity, "Number of IV identities eliminated"); STATISTIC(NumElimExt , "Number of IV sign/zero extends eliminated"); -STATISTIC(NumElimRem , "Number of IV remainder operations eliminated"); -STATISTIC(NumElimCmp , "Number of IV comparisons eliminated"); STATISTIC(NumElimIV , "Number of congruent IVs eliminated"); -static cl::opt<bool> DisableIVRewrite( - "disable-iv-rewrite", cl::Hidden, - cl::desc("Disable canonical induction variable rewriting")); - -// Temporary flag for use with -disable-iv-rewrite to force a canonical IV for -// LFTR purposes. -static cl::opt<bool> ForceLFTR( - "force-lftr", cl::Hidden, - cl::desc("Enable forced linear function test replacement")); +namespace llvm { + cl::opt<bool> EnableIVRewrite( + "enable-iv-rewrite", cl::Hidden, + cl::desc("Enable canonical induction variable rewriting")); + + // Trip count verification can be enabled by default under NDEBUG if we + // implement a strong expression equivalence checker in SCEV. Until then, we + // use the verify-indvars flag, which may assert in some cases. 
+ cl::opt<bool> VerifyIndvars( + "verify-indvars", cl::Hidden, + cl::desc("Verify the ScalarEvolution result after running indvars")); +} namespace { class IndVarSimplify : public LoopPass { @@ -111,12 +97,12 @@ namespace { AU.addRequired<ScalarEvolution>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); - if (!DisableIVRewrite) + if (EnableIVRewrite) AU.addRequired<IVUsers>(); AU.addPreserved<ScalarEvolution>(); AU.addPreservedID(LoopSimplifyID); AU.addPreservedID(LCSSAID); - if (!DisableIVRewrite) + if (EnableIVRewrite) AU.addPreserved<IVUsers>(); AU.setPreservesCFG(); } @@ -131,18 +117,9 @@ namespace { void HandleFloatingPointIV(Loop *L, PHINode *PH); void RewriteNonIntegerIVs(Loop *L); - void RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); - - void SimplifyIVUsers(SCEVExpander &Rewriter); - void SimplifyIVUsersNoRewrite(Loop *L, SCEVExpander &Rewriter); - - bool EliminateIVUser(Instruction *UseInst, Instruction *IVOperand); - void EliminateIVComparison(ICmpInst *ICmp, Value *IVOperand); - void EliminateIVRemainder(BinaryOperator *Rem, - Value *IVOperand, - bool IsSigned); + void SimplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LPPassManager &LPM); - void SimplifyCongruentIVs(Loop *L); + void RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); void RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter); @@ -240,8 +217,8 @@ static Instruction *getInsertPointForUses(Instruction *User, Value *Def, InsertPt = InsertBB->getTerminator(); } assert(InsertPt && "Missing phi operand"); - assert(!isa<Instruction>(Def) || - DT->dominates(cast<Instruction>(Def), InsertPt) && + assert((!isa<Instruction>(Def) || + DT->dominates(cast<Instruction>(Def), InsertPt)) && "def does not dominate all uses"); return InsertPt; } @@ -372,14 +349,14 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // Positive and negative strides have different safety conditions. if (IncValue > 0) { // If we have a positive stride, we require the init to be less than the - // exit value and an equality or less than comparison. - if (InitValue >= ExitValue || - NewPred == CmpInst::ICMP_SGT || NewPred == CmpInst::ICMP_SGE) + // exit value. + if (InitValue >= ExitValue) return; uint32_t Range = uint32_t(ExitValue-InitValue); - if (NewPred == CmpInst::ICMP_SLE) { - // Normalize SLE -> SLT, check for infinite loop. + // Check for infinite loop, either: + // while (i <= Exit) or until (i > Exit) + if (NewPred == CmpInst::ICMP_SLE || NewPred == CmpInst::ICMP_SGT) { if (++Range == 0) return; // Range overflows. } @@ -399,14 +376,14 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { } else { // If we have a negative stride, we require the init to be greater than the - // exit value and an equality or greater than comparison. - if (InitValue >= ExitValue || - NewPred == CmpInst::ICMP_SLT || NewPred == CmpInst::ICMP_SLE) + // exit value. + if (InitValue <= ExitValue) return; uint32_t Range = uint32_t(InitValue-ExitValue); - if (NewPred == CmpInst::ICMP_SGE) { - // Normalize SGE -> SGT, check for infinite loop. + // Check for infinite loop, either: + // while (i >= Exit) or until (i < Exit) + if (NewPred == CmpInst::ICMP_SGE || NewPred == CmpInst::ICMP_SLT) { if (++Range == 0) return; // Range overflows. } @@ -464,7 +441,7 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // platforms. 
if (WeakPH) { Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv", - PN->getParent()->getFirstNonPHI()); + PN->getParent()->getFirstInsertionPt()); PN->replaceAllUsesWith(Conv); RecursivelyDeleteTriviallyDeadInstructions(PN); } @@ -472,6 +449,8 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // Add a new IVUsers entry for the newly-created integer PHI. if (IU) IU->AddUsersIfInteresting(NewPHI); + + Changed = true; } void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) { @@ -617,45 +596,15 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { //===----------------------------------------------------------------------===// // Rewrite IV users based on a canonical IV. -// To be replaced by -disable-iv-rewrite. +// Only for use with -enable-iv-rewrite. //===----------------------------------------------------------------------===// -/// SimplifyIVUsers - Iteratively perform simplification on IVUsers within this -/// loop. IVUsers is treated as a worklist. Each successive simplification may -/// push more users which may themselves be candidates for simplification. -/// -/// This is the old approach to IV simplification to be replaced by -/// SimplifyIVUsersNoRewrite. -/// -void IndVarSimplify::SimplifyIVUsers(SCEVExpander &Rewriter) { - // Each round of simplification involves a round of eliminating operations - // followed by a round of widening IVs. A single IVUsers worklist is used - // across all rounds. The inner loop advances the user. If widening exposes - // more uses, then another pass through the outer loop is triggered. - for (IVUsers::iterator I = IU->begin(); I != IU->end(); ++I) { - Instruction *UseInst = I->getUser(); - Value *IVOperand = I->getOperandValToReplace(); - - if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) { - EliminateIVComparison(ICmp, IVOperand); - continue; - } - if (BinaryOperator *Rem = dyn_cast<BinaryOperator>(UseInst)) { - bool IsSigned = Rem->getOpcode() == Instruction::SRem; - if (IsSigned || Rem->getOpcode() == Instruction::URem) { - EliminateIVRemainder(Rem, IVOperand, IsSigned); - continue; - } - } - } -} - -// FIXME: It is an extremely bad idea to indvar substitute anything more -// complex than affine induction variables. Doing so will put expensive -// polynomial evaluations inside of the loop, and the str reduction pass -// currently can only reduce affine polynomials. For now just disable -// indvar subst on anything more complex than an affine addrec, unless -// it can be expanded to a trivial value. +/// FIXME: It is an extremely bad idea to indvar substitute anything more +/// complex than affine induction variables. Doing so will put expensive +/// polynomial evaluations inside of the loop, and the str reduction pass +/// currently can only reduce affine polynomials. For now just disable +/// indvar subst on anything more complex than an affine addrec, unless +/// it can be expanded to a trivial value. static bool isSafe(const SCEV *S, const Loop *L, ScalarEvolution *SE) { // Loop-invariant values are safe. if (SE->isLoopInvariant(S, L)) return true; @@ -666,7 +615,8 @@ static bool isSafe(const SCEV *S, const Loop *L, ScalarEvolution *SE) { return AR->isAffine(); // An add is safe it all its operands are safe. 
- if (const SCEVCommutativeExpr *Commutative = dyn_cast<SCEVCommutativeExpr>(S)) { + if (const SCEVCommutativeExpr *Commutative + = dyn_cast<SCEVCommutativeExpr>(S)) { for (SCEVCommutativeExpr::op_iterator I = Commutative->op_begin(), E = Commutative->op_end(); I != E; ++I) if (!isSafe(*I, L, SE)) return false; @@ -771,18 +721,37 @@ namespace { // extend operations. This information is recorded by CollectExtend and // provides the input to WidenIV. struct WideIVInfo { + PHINode *NarrowIV; Type *WidestNativeType; // Widest integer type created [sz]ext - bool IsSigned; // Was an sext user seen before a zext? + bool IsSigned; // Was an sext user seen before a zext? + + WideIVInfo() : NarrowIV(0), WidestNativeType(0), IsSigned(false) {} + }; + + class WideIVVisitor : public IVVisitor { + ScalarEvolution *SE; + const TargetData *TD; - WideIVInfo() : WidestNativeType(0), IsSigned(false) {} + public: + WideIVInfo WI; + + WideIVVisitor(PHINode *NarrowIV, ScalarEvolution *SCEV, + const TargetData *TData) : + SE(SCEV), TD(TData) { WI.NarrowIV = NarrowIV; } + + // Implement the interface used by simplifyUsersOfIV. + virtual void visitCast(CastInst *Cast); }; } -/// CollectExtend - Update information about the induction variable that is +/// visitCast - Update information about the induction variable that is /// extended by this sign or zero extend operation. This is used to determine /// the final width of the IV before actually widening it. -static void CollectExtend(CastInst *Cast, bool IsSigned, WideIVInfo &WI, - ScalarEvolution *SE, const TargetData *TD) { +void WideIVVisitor::visitCast(CastInst *Cast) { + bool IsSigned = Cast->getOpcode() == Instruction::SExt; + if (!IsSigned && Cast->getOpcode() != Instruction::ZExt) + return; + Type *Ty = Cast->getType(); uint64_t Width = SE->getTypeSizeInBits(Ty); if (TD && !TD->isLegalInteger(Width)) @@ -845,10 +814,10 @@ class WidenIV { SmallVector<NarrowIVDefUse, 8> NarrowIVUsers; public: - WidenIV(PHINode *PN, const WideIVInfo &WI, LoopInfo *LInfo, + WidenIV(const WideIVInfo &WI, LoopInfo *LInfo, ScalarEvolution *SEv, DominatorTree *DTree, SmallVectorImpl<WeakVH> &DI) : - OrigPhi(PN), + OrigPhi(WI.NarrowIV), WideType(WI.WidestNativeType), IsSigned(WI.IsSigned), LI(LInfo), @@ -865,18 +834,42 @@ public: PHINode *CreateWideIV(SCEVExpander &Rewriter); protected: + Value *getExtend(Value *NarrowOper, Type *WideType, bool IsSigned, + Instruction *Use); + Instruction *CloneIVUser(NarrowIVDefUse DU); const SCEVAddRecExpr *GetWideRecurrence(Instruction *NarrowUse); + const SCEVAddRecExpr* GetExtendedOperandRecurrence(NarrowIVDefUse DU); + Instruction *WidenIVUse(NarrowIVDefUse DU); void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef); }; } // anonymous namespace -static Value *getExtend( Value *NarrowOper, Type *WideType, - bool IsSigned, IRBuilder<> &Builder) { +/// isLoopInvariant - Perform a quick domtree based check for loop invariance +/// assuming that V is used within the loop. LoopInfo::isLoopInvariant() seems +/// gratuitous for this purpose. +static bool isLoopInvariant(Value *V, const Loop *L, const DominatorTree *DT) { + Instruction *Inst = dyn_cast<Instruction>(V); + if (!Inst) + return true; + + return DT->properlyDominates(Inst->getParent(), L->getHeader()); +} + +Value *WidenIV::getExtend(Value *NarrowOper, Type *WideType, bool IsSigned, + Instruction *Use) { + // Set the debug location and conservative insertion point. + IRBuilder<> Builder(Use); + // Hoist the insertion point into loop preheaders as far as possible. 
+ for (const Loop *L = LI->getLoopFor(Use->getParent()); + L && L->getLoopPreheader() && isLoopInvariant(NarrowOper, L, DT); + L = L->getParentLoop()) + Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator()); + return IsSigned ? Builder.CreateSExt(NarrowOper, WideType) : Builder.CreateZExt(NarrowOper, WideType); } @@ -901,22 +894,21 @@ Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) { case Instruction::AShr: DEBUG(dbgs() << "Cloning IVUser: " << *DU.NarrowUse << "\n"); - IRBuilder<> Builder(DU.NarrowUse); - // Replace NarrowDef operands with WideDef. Otherwise, we don't know // anything about the narrow operand yet so must insert a [sz]ext. It is // probably loop invariant and will be folded or hoisted. If it actually // comes from a widened IV, it should be removed during a future call to // WidenIVUse. Value *LHS = (DU.NarrowUse->getOperand(0) == DU.NarrowDef) ? DU.WideDef : - getExtend(DU.NarrowUse->getOperand(0), WideType, IsSigned, Builder); + getExtend(DU.NarrowUse->getOperand(0), WideType, IsSigned, DU.NarrowUse); Value *RHS = (DU.NarrowUse->getOperand(1) == DU.NarrowDef) ? DU.WideDef : - getExtend(DU.NarrowUse->getOperand(1), WideType, IsSigned, Builder); + getExtend(DU.NarrowUse->getOperand(1), WideType, IsSigned, DU.NarrowUse); BinaryOperator *NarrowBO = cast<BinaryOperator>(DU.NarrowUse); BinaryOperator *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS, NarrowBO->getName()); + IRBuilder<> Builder(DU.NarrowUse); Builder.Insert(WideBO); if (const OverflowingBinaryOperator *OBO = dyn_cast<OverflowingBinaryOperator>(NarrowBO)) { @@ -928,45 +920,46 @@ Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) { llvm_unreachable(0); } -/// HoistStep - Attempt to hoist an IV increment above a potential use. -/// -/// To successfully hoist, two criteria must be met: -/// - IncV operands dominate InsertPos and -/// - InsertPos dominates IncV -/// -/// Meeting the second condition means that we don't need to check all of IncV's -/// existing uses (it's moving up in the domtree). -/// -/// This does not yet recursively hoist the operands, although that would -/// not be difficult. -static bool HoistStep(Instruction *IncV, Instruction *InsertPos, - const DominatorTree *DT) -{ - if (DT->dominates(IncV, InsertPos)) - return true; +/// No-wrap operations can transfer sign extension of their result to their +/// operands. Generate the SCEV value for the widened operation without +/// actually modifying the IR yet. If the expression after extending the +/// operands is an AddRec for this loop, return it. +const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { + // Handle the common case of add<nsw/nuw> + if (DU.NarrowUse->getOpcode() != Instruction::Add) + return 0; - if (!DT->dominates(InsertPos->getParent(), IncV->getParent())) - return false; + // One operand (NarrowDef) has already been extended to WideDef. Now determine + // if extending the other will lead to a recurrence. + unsigned ExtendOperIdx = DU.NarrowUse->getOperand(0) == DU.NarrowDef ? 
1 : 0; + assert(DU.NarrowUse->getOperand(1-ExtendOperIdx) == DU.NarrowDef && "bad DU"); + + const SCEV *ExtendOperExpr = 0; + const OverflowingBinaryOperator *OBO = + cast<OverflowingBinaryOperator>(DU.NarrowUse); + if (IsSigned && OBO->hasNoSignedWrap()) + ExtendOperExpr = SE->getSignExtendExpr( + SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType); + else if(!IsSigned && OBO->hasNoUnsignedWrap()) + ExtendOperExpr = SE->getZeroExtendExpr( + SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType); + else + return 0; - if (IncV->mayHaveSideEffects()) - return false; + const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>( + SE->getAddExpr(SE->getSCEV(DU.WideDef), ExtendOperExpr, + IsSigned ? SCEV::FlagNSW : SCEV::FlagNUW)); - // Attempt to hoist IncV - for (User::op_iterator OI = IncV->op_begin(), OE = IncV->op_end(); - OI != OE; ++OI) { - Instruction *OInst = dyn_cast<Instruction>(OI); - if (OInst && !DT->dominates(OInst, InsertPos)) - return false; - } - IncV->moveBefore(InsertPos); - return true; + if (!AddRec || AddRec->getLoop() != L) + return 0; + return AddRec; } -// GetWideRecurrence - Is this instruction potentially interesting from IVUsers' -// perspective after widening it's type? In other words, can the extend be -// safely hoisted out of the loop with SCEV reducing the value to a recurrence -// on the same loop. If so, return the sign or zero extended -// recurrence. Otherwise return NULL. +/// GetWideRecurrence - Is this instruction potentially interesting from +/// IVUsers' perspective after widening it's type? In other words, can the +/// extend be safely hoisted out of the loop with SCEV reducing the value to a +/// recurrence on the same loop. If so, return the sign or zero extended +/// recurrence. Otherwise return NULL. const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { if (!SE->isSCEVable(NarrowUse->getType())) return 0; @@ -985,7 +978,6 @@ const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(WideExpr); if (!AddRec || AddRec->getLoop() != L) return 0; - return AddRec; } @@ -1039,6 +1031,9 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU) { // Does this user itself evaluate to a recurrence after widening? const SCEVAddRecExpr *WideAddRec = GetWideRecurrence(DU.NarrowUse); if (!WideAddRec) { + WideAddRec = GetExtendedOperandRecurrence(DU); + } + if (!WideAddRec) { // This user does not evaluate to a recurence after widening, so don't // follow it. Instead insert a Trunc to kill off the original use, // eventually isolating the original narrow IV so it can be removed. @@ -1055,9 +1050,9 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU) { // Reuse the IV increment that SCEVExpander created as long as it dominates // NarrowUse. Instruction *WideUse = 0; - if (WideAddRec == WideIncExpr && HoistStep(WideInc, DU.NarrowUse, DT)) { + if (WideAddRec == WideIncExpr + && SCEVExpander::hoistStep(WideInc, DU.NarrowUse, DT)) WideUse = WideInc; - } else { WideUse = CloneIVUser(DU); if (!WideUse) @@ -1178,183 +1173,17 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { // Simplification of IV users based on SCEV evaluation. 
//===----------------------------------------------------------------------===// -void IndVarSimplify::EliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) { - unsigned IVOperIdx = 0; - ICmpInst::Predicate Pred = ICmp->getPredicate(); - if (IVOperand != ICmp->getOperand(0)) { - // Swapped - assert(IVOperand == ICmp->getOperand(1) && "Can't find IVOperand"); - IVOperIdx = 1; - Pred = ICmpInst::getSwappedPredicate(Pred); - } - - // Get the SCEVs for the ICmp operands. - const SCEV *S = SE->getSCEV(ICmp->getOperand(IVOperIdx)); - const SCEV *X = SE->getSCEV(ICmp->getOperand(1 - IVOperIdx)); - - // Simplify unnecessary loops away. - const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent()); - S = SE->getSCEVAtScope(S, ICmpLoop); - X = SE->getSCEVAtScope(X, ICmpLoop); - - // If the condition is always true or always false, replace it with - // a constant value. - if (SE->isKnownPredicate(Pred, S, X)) - ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext())); - else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) - ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext())); - else - return; - DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); - ++NumElimCmp; - Changed = true; - DeadInsts.push_back(ICmp); -} - -void IndVarSimplify::EliminateIVRemainder(BinaryOperator *Rem, - Value *IVOperand, - bool IsSigned) { - // We're only interested in the case where we know something about - // the numerator. - if (IVOperand != Rem->getOperand(0)) - return; - - // Get the SCEVs for the ICmp operands. - const SCEV *S = SE->getSCEV(Rem->getOperand(0)); - const SCEV *X = SE->getSCEV(Rem->getOperand(1)); - - // Simplify unnecessary loops away. - const Loop *ICmpLoop = LI->getLoopFor(Rem->getParent()); - S = SE->getSCEVAtScope(S, ICmpLoop); - X = SE->getSCEVAtScope(X, ICmpLoop); - - // i % n --> i if i is in [0,n). - if ((!IsSigned || SE->isKnownNonNegative(S)) && - SE->isKnownPredicate(IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, - S, X)) - Rem->replaceAllUsesWith(Rem->getOperand(0)); - else { - // (i+1) % n --> (i+1)==n?0:(i+1) if i is in [0,n). - const SCEV *LessOne = - SE->getMinusSCEV(S, SE->getConstant(S->getType(), 1)); - if (IsSigned && !SE->isKnownNonNegative(LessOne)) - return; - - if (!SE->isKnownPredicate(IsSigned ? - ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, - LessOne, X)) - return; - - ICmpInst *ICmp = new ICmpInst(Rem, ICmpInst::ICMP_EQ, - Rem->getOperand(0), Rem->getOperand(1), - "tmp"); - SelectInst *Sel = - SelectInst::Create(ICmp, - ConstantInt::get(Rem->getType(), 0), - Rem->getOperand(0), "tmp", Rem); - Rem->replaceAllUsesWith(Sel); - } - - // Inform IVUsers about the new users. - if (IU) { - if (Instruction *I = dyn_cast<Instruction>(Rem->getOperand(0))) - IU->AddUsersIfInteresting(I); - } - DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n'); - ++NumElimRem; - Changed = true; - DeadInsts.push_back(Rem); -} - -/// EliminateIVUser - Eliminate an operation that consumes a simple IV and has -/// no observable side-effect given the range of IV values. 
-bool IndVarSimplify::EliminateIVUser(Instruction *UseInst, - Instruction *IVOperand) { - if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) { - EliminateIVComparison(ICmp, IVOperand); - return true; - } - if (BinaryOperator *Rem = dyn_cast<BinaryOperator>(UseInst)) { - bool IsSigned = Rem->getOpcode() == Instruction::SRem; - if (IsSigned || Rem->getOpcode() == Instruction::URem) { - EliminateIVRemainder(Rem, IVOperand, IsSigned); - return true; - } - } - - // Eliminate any operation that SCEV can prove is an identity function. - if (!SE->isSCEVable(UseInst->getType()) || - (UseInst->getType() != IVOperand->getType()) || - (SE->getSCEV(UseInst) != SE->getSCEV(IVOperand))) - return false; - - DEBUG(dbgs() << "INDVARS: Eliminated identity: " << *UseInst << '\n'); - - UseInst->replaceAllUsesWith(IVOperand); - ++NumElimIdentity; - Changed = true; - DeadInsts.push_back(UseInst); - return true; -} - -/// pushIVUsers - Add all uses of Def to the current IV's worklist. -/// -static void pushIVUsers( - Instruction *Def, - SmallPtrSet<Instruction*,16> &Simplified, - SmallVectorImpl< std::pair<Instruction*,Instruction*> > &SimpleIVUsers) { - - for (Value::use_iterator UI = Def->use_begin(), E = Def->use_end(); - UI != E; ++UI) { - Instruction *User = cast<Instruction>(*UI); - - // Avoid infinite or exponential worklist processing. - // Also ensure unique worklist users. - // If Def is a LoopPhi, it may not be in the Simplified set, so check for - // self edges first. - if (User != Def && Simplified.insert(User)) - SimpleIVUsers.push_back(std::make_pair(User, Def)); - } -} - -/// isSimpleIVUser - Return true if this instruction generates a simple SCEV -/// expression in terms of that IV. -/// -/// This is similar to IVUsers' isInsteresting() but processes each instruction -/// non-recursively when the operand is already known to be a simpleIVUser. -/// -static bool isSimpleIVUser(Instruction *I, const Loop *L, ScalarEvolution *SE) { - if (!SE->isSCEVable(I->getType())) - return false; - - // Get the symbolic expression for this instruction. - const SCEV *S = SE->getSCEV(I); - - // Only consider affine recurrences. - const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S); - if (AR && AR->getLoop() == L) - return true; - - return false; -} - -/// SimplifyIVUsersNoRewrite - Iteratively perform simplification on a worklist -/// of IV users. Each successive simplification may push more users which may +/// SimplifyAndExtend - Iteratively perform simplification on a worklist of IV +/// users. Each successive simplification may push more users which may /// themselves be candidates for simplification. /// -/// The "NoRewrite" algorithm does not require IVUsers analysis. Instead, it -/// simplifies instructions in-place during analysis. Rather than rewriting -/// induction variables bottom-up from their users, it transforms a chain of -/// IVUsers top-down, updating the IR only when it encouters a clear -/// optimization opportunitiy. A SCEVExpander "Rewriter" instance is still -/// needed, but only used to generate a new IV (phi) of wider type for sign/zero -/// extend elimination. +/// Sign/Zero extend elimination is interleaved with IV simplification. /// -/// Once DisableIVRewrite is default, LSR will be the only client of IVUsers. 
-/// -void IndVarSimplify::SimplifyIVUsersNoRewrite(Loop *L, SCEVExpander &Rewriter) { - std::map<PHINode *, WideIVInfo> WideIVMap; +void IndVarSimplify::SimplifyAndExtend(Loop *L, + SCEVExpander &Rewriter, + LPPassManager &LPM) { + SmallVector<WideIVInfo, 8> WideIVs; SmallVector<PHINode*, 8> LoopPhis; for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { @@ -1370,108 +1199,27 @@ void IndVarSimplify::SimplifyIVUsersNoRewrite(Loop *L, SCEVExpander &Rewriter) { // extension. The first time SCEV attempts to normalize sign/zero extension, // the result becomes final. So for the most predictable results, we delay // evaluation of sign/zero extend evaluation until needed, and avoid running - // other SCEV based analysis prior to SimplifyIVUsersNoRewrite. + // other SCEV based analysis prior to SimplifyAndExtend. do { PHINode *CurrIV = LoopPhis.pop_back_val(); // Information about sign/zero extensions of CurrIV. - WideIVInfo WI; - - // Instructions processed by SimplifyIVUsers for CurrIV. - SmallPtrSet<Instruction*,16> Simplified; + WideIVVisitor WIV(CurrIV, SE, TD); - // Use-def pairs if IV users waiting to be processed for CurrIV. - SmallVector<std::pair<Instruction*, Instruction*>, 8> SimpleIVUsers; + Changed |= simplifyUsersOfIV(CurrIV, SE, &LPM, DeadInsts, &WIV); - // Push users of the current LoopPhi. In rare cases, pushIVUsers may be - // called multiple times for the same LoopPhi. This is the proper thing to - // do for loop header phis that use each other. - pushIVUsers(CurrIV, Simplified, SimpleIVUsers); - - while (!SimpleIVUsers.empty()) { - Instruction *UseInst, *Operand; - tie(UseInst, Operand) = SimpleIVUsers.pop_back_val(); - // Bypass back edges to avoid extra work. - if (UseInst == CurrIV) continue; - - if (EliminateIVUser(UseInst, Operand)) { - pushIVUsers(Operand, Simplified, SimpleIVUsers); - continue; - } - if (CastInst *Cast = dyn_cast<CastInst>(UseInst)) { - bool IsSigned = Cast->getOpcode() == Instruction::SExt; - if (IsSigned || Cast->getOpcode() == Instruction::ZExt) { - CollectExtend(Cast, IsSigned, WI, SE, TD); - } - continue; - } - if (isSimpleIVUser(UseInst, L, SE)) { - pushIVUsers(UseInst, Simplified, SimpleIVUsers); - } - } - if (WI.WidestNativeType) { - WideIVMap[CurrIV] = WI; + if (WIV.WI.WidestNativeType) { + WideIVs.push_back(WIV.WI); } } while(!LoopPhis.empty()); - for (std::map<PHINode *, WideIVInfo>::const_iterator I = WideIVMap.begin(), - E = WideIVMap.end(); I != E; ++I) { - WidenIV Widener(I->first, I->second, LI, SE, DT, DeadInsts); + for (; !WideIVs.empty(); WideIVs.pop_back()) { + WidenIV Widener(WideIVs.back(), LI, SE, DT, DeadInsts); if (PHINode *WidePhi = Widener.CreateWideIV(Rewriter)) { Changed = true; LoopPhis.push_back(WidePhi); } } - WideIVMap.clear(); - } -} - -/// SimplifyCongruentIVs - Check for congruent phis in this loop header and -/// populate ExprToIVMap for use later. -/// -void IndVarSimplify::SimplifyCongruentIVs(Loop *L) { - DenseMap<const SCEV *, PHINode *> ExprToIVMap; - for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { - PHINode *Phi = cast<PHINode>(I); - if (!SE->isSCEVable(Phi->getType())) - continue; - - const SCEV *S = SE->getSCEV(Phi); - DenseMap<const SCEV *, PHINode *>::const_iterator Pos; - bool Inserted; - tie(Pos, Inserted) = ExprToIVMap.insert(std::make_pair(S, Phi)); - if (Inserted) - continue; - PHINode *OrigPhi = Pos->second; - - // If one phi derives from the other via GEPs, types may differ. 
- if (OrigPhi->getType() != Phi->getType()) - continue; - - // Replacing the congruent phi is sufficient because acyclic redundancy - // elimination, CSE/GVN, should handle the rest. However, once SCEV proves - // that a phi is congruent, it's almost certain to be the head of an IV - // user cycle that is isomorphic with the original phi. So it's worth - // eagerly cleaning up the common case of a single IV increment. - if (BasicBlock *LatchBlock = L->getLoopLatch()) { - Instruction *OrigInc = - cast<Instruction>(OrigPhi->getIncomingValueForBlock(LatchBlock)); - Instruction *IsomorphicInc = - cast<Instruction>(Phi->getIncomingValueForBlock(LatchBlock)); - if (OrigInc != IsomorphicInc && - OrigInc->getType() == IsomorphicInc->getType() && - SE->getSCEV(OrigInc) == SE->getSCEV(IsomorphicInc) && - HoistStep(OrigInc, IsomorphicInc, DT)) { - DEBUG(dbgs() << "INDVARS: Eliminated congruent iv.inc: " - << *IsomorphicInc << '\n'); - IsomorphicInc->replaceAllUsesWith(OrigInc); - DeadInsts.push_back(IsomorphicInc); - } - } - DEBUG(dbgs() << "INDVARS: Eliminated congruent iv: " << *Phi << '\n'); - ++NumElimIV; - Phi->replaceAllUsesWith(OrigPhi); - DeadInsts.push_back(Phi); } } @@ -1479,9 +1227,9 @@ void IndVarSimplify::SimplifyCongruentIVs(Loop *L) { // LinearFunctionTestReplace and its kin. Rewrite the loop exit condition. //===----------------------------------------------------------------------===// -// Check for expressions that ScalarEvolution generates to compute -// BackedgeTakenInfo. If these expressions have not been reduced, then expanding -// them may incur additional cost (albeit in the loop preheader). +/// Check for expressions that ScalarEvolution generates to compute +/// BackedgeTakenInfo. If these expressions have not been reduced, then +/// expanding them may incur additional cost (albeit in the loop preheader). static bool isHighCostExpansion(const SCEV *S, BranchInst *BI, ScalarEvolution *SE) { // If the backedge-taken count is a UDiv, it's very likely a UDiv that @@ -1502,7 +1250,7 @@ static bool isHighCostExpansion(const SCEV *S, BranchInst *BI, } } - if (!DisableIVRewrite || ForceLFTR) + if (EnableIVRewrite) return false; // Recurse past add expressions, which commonly occur in the @@ -1580,17 +1328,6 @@ static Type *getBackedgeIVType(Loop *L) { return Ty; } -/// isLoopInvariant - Perform a quick domtree based check for loop invariance -/// assuming that V is used within the loop. LoopInfo::isLoopInvariant() seems -/// gratuitous for this purpose. -static bool isLoopInvariant(Value *V, Loop *L, DominatorTree *DT) { - Instruction *Inst = dyn_cast<Instruction>(V); - if (!Inst) - return true; - - return DT->properlyDominates(Inst->getParent(), L->getHeader()); -} - /// getLoopPhiForCounter - Return the loop header phi IFF IncV adds a loop /// invariant value to the phi. static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) { @@ -1779,10 +1516,9 @@ LinearFunctionTestReplace(Loop *L, assert(canExpandBackedgeTakenCount(L, SE) && "precondition"); BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator()); - // In DisableIVRewrite mode, IndVar is not necessarily a canonical IV. In this - // mode, LFTR can ignore IV overflow and truncate to the width of + // LFTR can ignore IV overflow and truncate to the width of // BECount. This avoids materializing the add(zext(add)) expression. - Type *CntTy = DisableIVRewrite ? + Type *CntTy = !EnableIVRewrite ? 
BackedgeTakenCount->getType() : IndVar->getType(); const SCEV *IVLimit = BackedgeTakenCount; @@ -1832,7 +1568,7 @@ LinearFunctionTestReplace(Loop *L, const SCEV *IVInit = AR->getStart(); // For pointer types, sign extend BECount in order to materialize a GEP. - // Note that for DisableIVRewrite, we never run SCEVExpander on a + // Note that for without EnableIVRewrite, we never run SCEVExpander on a // pointer type, because we must preserve the existing GEPs. Instead we // directly generate a GEP later. if (IVInit->getType()->isPointerTy()) { @@ -1919,7 +1655,7 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) { BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) return; - Instruction *InsertPt = ExitBlock->getFirstNonPHI(); + Instruction *InsertPt = ExitBlock->getFirstInsertionPt(); BasicBlock::iterator I = Preheader->getTerminator(); while (I != Preheader->begin()) { --I; @@ -1940,6 +1676,10 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) { if (isa<DbgInfoIntrinsic>(I)) continue; + // Skip landingpad instructions. + if (isa<LandingPadInst>(I)) + continue; + // Don't sink static AllocaInsts out of the entry block, which would // turn them into dynamic allocas! if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) @@ -2006,7 +1746,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { if (!L->isLoopSimplifyForm()) return false; - if (!DisableIVRewrite) + if (EnableIVRewrite) IU = &getAnalysis<IVUsers>(); LI = &getAnalysis<LoopInfo>(); SE = &getAnalysis<ScalarEvolution>(); @@ -2024,6 +1764,9 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Create a rewriter object which we'll use to transform the code with. SCEVExpander Rewriter(*SE, "indvars"); +#ifndef NDEBUG + Rewriter.setDebugType(DEBUG_TYPE); +#endif // Eliminate redundant IV users. // @@ -2031,9 +1774,9 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // attempt to avoid evaluating SCEVs for sign/zero extend operations until // other expressions involving loop IVs have been evaluated. This helps SCEV // set no-wrap flags before normalizing sign/zero extension. - if (DisableIVRewrite) { + if (!EnableIVRewrite) { Rewriter.disableCanonicalMode(); - SimplifyIVUsersNoRewrite(L, Rewriter); + SimplifyAndExtend(L, Rewriter, LPM); } // Check to see if this loop has a computable loop-invariant execution count. @@ -2046,26 +1789,25 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { RewriteLoopExitValues(L, Rewriter); // Eliminate redundant IV users. - if (!DisableIVRewrite) - SimplifyIVUsers(Rewriter); + if (EnableIVRewrite) + Changed |= simplifyIVUsers(IU, SE, &LPM, DeadInsts); // Eliminate redundant IV cycles. - if (DisableIVRewrite) - SimplifyCongruentIVs(L); + if (!EnableIVRewrite) + NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts); // Compute the type of the largest recurrence expression, and decide whether // a canonical induction variable should be inserted. Type *LargestType = 0; bool NeedCannIV = false; - bool ReuseIVForExit = DisableIVRewrite && !ForceLFTR; bool ExpandBECount = canExpandBackedgeTakenCount(L, SE); - if (ExpandBECount && !ReuseIVForExit) { + if (EnableIVRewrite && ExpandBECount) { // If we have a known trip count and a single exit block, we'll be // rewriting the loop exit test condition below, which requires a // canonical induction variable. 
NeedCannIV = true; Type *Ty = BackedgeTakenCount->getType(); - if (DisableIVRewrite) { + if (!EnableIVRewrite) { // In this mode, SimplifyIVUsers may have already widened the IV used by // the backedge test and inserted a Trunc on the compare's operand. Get // the wider type to avoid creating a redundant narrow IV only used by the @@ -2077,7 +1819,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { SE->getTypeSizeInBits(LargestType)) LargestType = SE->getEffectiveSCEVType(Ty); } - if (!DisableIVRewrite) { + if (EnableIVRewrite) { for (IVUsers::const_iterator I = IU->begin(), E = IU->end(); I != E; ++I) { NeedCannIV = true; Type *Ty = @@ -2119,10 +1861,10 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // the end of the pass. while (!OldCannIVs.empty()) { PHINode *OldCannIV = OldCannIVs.pop_back_val(); - OldCannIV->insertBefore(L->getHeader()->getFirstNonPHI()); + OldCannIV->insertBefore(L->getHeader()->getFirstInsertionPt()); } } - else if (ExpandBECount && ReuseIVForExit && needsLFTR(L, DT)) { + else if (!EnableIVRewrite && ExpandBECount && needsLFTR(L, DT)) { IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT, TD); } // If we have a trip count expression, rewrite the loop's exit condition @@ -2143,7 +1885,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar, Rewriter); } // Rewrite IV-derived expressions. - if (!DisableIVRewrite) + if (EnableIVRewrite) RewriteIVExpressions(L, Rewriter); // Clear the rewriter cache, because values that are in the rewriter's cache @@ -2180,7 +1922,8 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Verify that LFTR, and any other change have not interfered with SCEV's // ability to compute trip count. #ifndef NDEBUG - if (DisableIVRewrite && !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { + if (!EnableIVRewrite && VerifyIndvars && + !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { SE->forgetLoop(L); const SCEV *NewBECount = SE->getBackedgeTakenCount(L); if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) < diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index b500d5b..f410af3 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -811,8 +811,8 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { /// important optimization that encourages jump threading, and needs to be run /// interlaced with other jump threading tasks. bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { - // Don't hack volatile loads. - if (LI->isVolatile()) return false; + // Don't hack volatile/atomic loads. + if (!LI->isSimple()) return false; // If the load is defined in a block with exactly one predecessor, it can't be // partially redundant. diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 66add6c..b79bb13 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -151,6 +151,11 @@ namespace { /// bool isSafeToExecuteUnconditionally(Instruction &I); + /// isGuaranteedToExecute - Check that the instruction is guaranteed to + /// execute. + /// + bool isGuaranteedToExecute(Instruction &I); + /// pointerInvalidatedByLoop - Return true if the body of this loop may /// store into the memory location pointed to by V. 
/// @@ -357,8 +362,8 @@ void LICM::HoistRegion(DomTreeNode *N) { bool LICM::canSinkOrHoistInst(Instruction &I) { // Loads have extra constraints we have to verify before we can hoist them. if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { - if (LI->isVolatile()) - return false; // Don't hoist volatile loads! + if (!LI->isUnordered()) + return false; // Don't hoist volatile/atomic loads! // Loads from constant memory are always safe to move, even if they end up // in the same alias set as something that ends up being modified. @@ -461,7 +466,7 @@ void LICM::sink(Instruction &I) { } else { // Move the instruction to the start of the exit block, after any PHI // nodes in it. - I.moveBefore(ExitBlocks[0]->getFirstNonPHI()); + I.moveBefore(ExitBlocks[0]->getFirstInsertionPt()); // This instruction is no longer in the AST for the current loop, because // we just sunk it out of the loop. If we just sunk it into an outer @@ -504,7 +509,7 @@ void LICM::sink(Instruction &I) { continue; // Insert the code after the last PHI node. - BasicBlock::iterator InsertPt = ExitBlock->getFirstNonPHI(); + BasicBlock::iterator InsertPt = ExitBlock->getFirstInsertionPt(); // If this is the first exit block processed, just move the original // instruction, otherwise clone the original instruction and insert @@ -577,6 +582,10 @@ bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) { if (Inst.isSafeToSpeculativelyExecute()) return true; + return isGuaranteedToExecute(Inst); +} + +bool LICM::isGuaranteedToExecute(Instruction &Inst) { // Otherwise we have to check to make sure that the instruction dominates all // of the exit blocks. If it doesn't, then there is a path out of the loop // which does not execute this instruction, so we can't hoist it. @@ -635,7 +644,7 @@ namespace { for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) { BasicBlock *ExitBlock = LoopExitBlocks[i]; Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock); - Instruction *InsertPos = ExitBlock->getFirstNonPHI(); + Instruction *InsertPos = ExitBlock->getFirstInsertionPt(); StoreInst *NewSI = new StoreInst(LiveInValue, SomePtr, InsertPos); NewSI->setAlignment(Alignment); NewSI->setDebugLoc(DL); @@ -713,34 +722,41 @@ void LICM::PromoteAliasSet(AliasSet &AS) { // If there is an non-load/store instruction in the loop, we can't promote // it. - unsigned InstAlignment; if (LoadInst *load = dyn_cast<LoadInst>(Use)) { - assert(!cast<LoadInst>(Use)->isVolatile() && "AST broken"); - InstAlignment = load->getAlignment(); + assert(!load->isVolatile() && "AST broken"); + if (!load->isSimple()) + return; } else if (StoreInst *store = dyn_cast<StoreInst>(Use)) { // Stores *of* the pointer are not interesting, only stores *to* the // pointer. if (Use->getOperand(1) != ASIV) continue; - InstAlignment = store->getAlignment(); - assert(!cast<StoreInst>(Use)->isVolatile() && "AST broken"); + assert(!store->isVolatile() && "AST broken"); + if (!store->isSimple()) + return; + + // Note that we only check GuaranteedToExecute inside the store case + // so that we do not introduce stores where they did not exist before + // (which would break the LLVM concurrency model). + + // If the alignment of this instruction allows us to specify a more + // restrictive (and performant) alignment and if we are sure this + // instruction will be executed, update the alignment. + // Larger is better, with the exception of 0 being the best alignment. 
+ unsigned InstAlignment = store->getAlignment(); + if ((InstAlignment > Alignment || InstAlignment == 0) + && (Alignment != 0)) + if (isGuaranteedToExecute(*Use)) { + GuaranteedToExecute = true; + Alignment = InstAlignment; + } + + if (!GuaranteedToExecute) + GuaranteedToExecute = isGuaranteedToExecute(*Use); + } else return; // Not a load or store. - // If the alignment of this instruction allows us to specify a more - // restrictive (and performant) alignment and if we are sure this - // instruction will be executed, update the alignment. - // Larger is better, with the exception of 0 being the best alignment. - if ((InstAlignment > Alignment || InstAlignment == 0) - && (Alignment != 0)) - if (isSafeToExecuteUnconditionally(*Use)) { - GuaranteedToExecute = true; - Alignment = InstAlignment; - } - - if (!GuaranteedToExecute) - GuaranteedToExecute = isSafeToExecuteUnconditionally(*Use); - LoopUses.push_back(Use); } } diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index ea4c515..ad15cbb 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -267,7 +267,7 @@ bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, /// processLoopStore - See if this store can be promoted to a memset or memcpy. bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { - if (SI->isVolatile()) return false; + if (!SI->isSimple()) return false; Value *StoredVal = SI->getValueOperand(); Value *StorePtr = SI->getPointerOperand(); @@ -314,7 +314,7 @@ bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getOperand(0))); if (LoadEv && LoadEv->getLoop() == CurLoop && LoadEv->isAffine() && - StoreEv->getOperand(1) == LoadEv->getOperand(1) && !LI->isVolatile()) + StoreEv->getOperand(1) == LoadEv->getOperand(1) && LI->isSimple()) if (processLoopStoreOfLoopLoad(SI, StoreSize, StoreEv, LoadEv, BECount)) return true; } @@ -463,7 +463,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, SplatValue = 0; } else { // Otherwise, this isn't an idiom we can transform. For example, we can't - // do anything with a 3-byte store, for example. + // do anything with a 3-byte store. return false; } diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index e90b5bc..3e122c2 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -70,12 +70,27 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/DenseSet.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLowering.h" #include <algorithm> using namespace llvm; +namespace llvm { +cl::opt<bool> EnableNested( + "enable-lsr-nested", cl::Hidden, cl::desc("Enable LSR on nested loops")); + +cl::opt<bool> EnableRetry( + "enable-lsr-retry", cl::Hidden, cl::desc("Enable LSR retry")); + +// Temporary flag to cleanup congruent phis after LSR phi expansion. +// It's currently disabled until we can determine whether it's truly useful or +// not. The flag should be removed after the v3.0 release. +cl::opt<bool> EnablePhiElim( + "enable-lsr-phielim", cl::Hidden, cl::desc("Enable LSR phi elimination")); +} + namespace { /// RegSortData - This class holds data which is used to order reuse candidates. 
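A standalone sketch, not part of the patch, of the alignment-merge rule used in the LICM::PromoteAliasSet hunk above: a larger explicit alignment is preferable, and an alignment of 0 (use the type's natural alignment) beats any explicit value; the real code additionally requires the store to be guaranteed to execute before committing the new alignment. The helper name below is made up for illustration.

#include <cassert>

// Mirrors the condition (InstAlignment > Alignment || InstAlignment == 0)
// && (Alignment != 0) from the promoted-store alignment update above.
static bool isMoreUsefulAlignment(unsigned InstAlign, unsigned CurAlign) {
  return (InstAlign > CurAlign || InstAlign == 0) && CurAlign != 0;
}

int main() {
  assert(isMoreUsefulAlignment(16, 4));  // larger explicit alignment wins
  assert(isMoreUsefulAlignment(0, 8));   // 0 (natural alignment) beats any explicit value
  assert(!isMoreUsefulAlignment(8, 0));  // nothing improves on an existing 0
  assert(!isMoreUsefulAlignment(4, 8));  // weaker alignment is ignored
  return 0;
}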
@@ -670,6 +685,21 @@ public: void Loose(); +#ifndef NDEBUG + // Once any of the metrics loses, they must all remain losers. + bool isValid() { + return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds + | ImmCost | SetupCost) != ~0u) + || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds + & ImmCost & SetupCost) == ~0u); + } +#endif + + bool isLoser() { + assert(isValid() && "invalid cost"); + return NumRegs == ~0u; + } + void RateFormula(const Formula &F, SmallPtrSet<const SCEV *, 16> &Regs, const DenseSet<const SCEV *> &VisitedRegs, @@ -702,34 +732,48 @@ void Cost::RateRegister(const SCEV *Reg, if (AR->getLoop() == L) AddRecCost += 1; /// TODO: This should be a function of the stride. - // If this is an addrec for a loop that's already been visited by LSR, - // don't second-guess its addrec phi nodes. LSR isn't currently smart - // enough to reason about more than one loop at a time. Consider these - // registers free and leave them alone. - else if (L->contains(AR->getLoop()) || + // If this is an addrec for another loop, don't second-guess its addrec phi + // nodes. LSR isn't currently smart enough to reason about more than one + // loop at a time. LSR has either already run on inner loops, will not run + // on other loops, and cannot be expected to change sibling loops. If the + // AddRec exists, consider it's register free and leave it alone. Otherwise, + // do not consider this formula at all. + // FIXME: why do we need to generate such fomulae? + else if (!EnableNested || L->contains(AR->getLoop()) || (!AR->getLoop()->contains(L) && DT.dominates(L->getHeader(), AR->getLoop()->getHeader()))) { for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin(); - PHINode *PN = dyn_cast<PHINode>(I); ++I) + PHINode *PN = dyn_cast<PHINode>(I); ++I) { if (SE.isSCEVable(PN->getType()) && (SE.getEffectiveSCEVType(PN->getType()) == SE.getEffectiveSCEVType(AR->getType())) && SE.getSCEV(PN) == AR) return; - + } + if (!EnableNested) { + Loose(); + return; + } // If this isn't one of the addrecs that the loop already has, it // would require a costly new phi and add. TODO: This isn't // precisely modeled right now. ++NumBaseAdds; - if (!Regs.count(AR->getStart())) + if (!Regs.count(AR->getStart())) { RateRegister(AR->getStart(), Regs, L, SE, DT); + if (isLoser()) + return; + } } // Add the step value register, if it needs one. // TODO: The non-affine case isn't precisely modeled here. - if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) - if (!Regs.count(AR->getStart())) + if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) { + if (!Regs.count(AR->getOperand(1))) { RateRegister(AR->getOperand(1), Regs, L, SE, DT); + if (isLoser()) + return; + } + } } ++NumRegs; @@ -769,6 +813,8 @@ void Cost::RateFormula(const Formula &F, return; } RatePrimaryRegister(ScaledReg, Regs, L, SE, DT); + if (isLoser()) + return; } for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) { @@ -778,6 +824,8 @@ void Cost::RateFormula(const Formula &F, return; } RatePrimaryRegister(BaseReg, Regs, L, SE, DT); + if (isLoser()) + return; } // Determine how many (unfolded) adds we'll need inside the loop. @@ -795,6 +843,7 @@ void Cost::RateFormula(const Formula &F, else if (Offset != 0) ImmCost += APInt(64, Offset, true).getMinSignedBits(); } + assert(isValid() && "invalid cost"); } /// Loose - Set this cost to a losing value. 
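A minimal sketch, separate from the patch, of the invariant that the new Cost::isValid() assertion above encodes: Loose() saturates every metric to ~0u, so a cost is either fully saturated or has no saturated metric at all (assuming individual metrics stay small enough that their bitwise OR cannot reach ~0u by accident). The struct and field names below are illustrative only.

#include <cassert>

struct CostSketch {
  unsigned NumRegs = 0, AddRecCost = 0, NumIVMuls = 0,
           NumBaseAdds = 0, ImmCost = 0, SetupCost = 0;

  // Mirror of Cost::Loose(): once a formula loses, every metric saturates.
  void Loose() {
    NumRegs = AddRecCost = NumIVMuls = NumBaseAdds = ImmCost = SetupCost = ~0u;
  }

  // Mirror of Cost::isValid(): either the OR of the metrics is not ~0u
  // (so none of them is saturated) or the AND is ~0u (so all of them are).
  bool isValid() const {
    return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds | ImmCost |
             SetupCost) != ~0u) ||
           ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds & ImmCost &
             SetupCost) == ~0u);
  }

  // Mirror of Cost::isLoser(): a saturated NumRegs marks a losing formula.
  bool isLoser() const {
    assert(isValid() && "invalid cost");
    return NumRegs == ~0u;
  }
};

int main() {
  CostSketch C;
  assert(C.isValid() && !C.isLoser());  // fresh cost: nothing saturated
  C.Loose();
  assert(C.isValid() && C.isLoser());   // loser: everything saturated
  C.SetupCost = 1;                      // a half-saturated cost is rejected
  assert(!C.isValid());
  return 0;
}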
@@ -1156,7 +1205,7 @@ static bool isLegalUse(const TargetLowering::AddrMode &AM, // If we have low-level target information, ask the target if it can fold an // integer immediate on an icmp. if (AM.BaseOffs != 0) { - if (TLI) return TLI->isLegalICmpImmediate(-AM.BaseOffs); + if (TLI) return TLI->isLegalICmpImmediate(-(uint64_t)AM.BaseOffs); return false; } @@ -1427,6 +1476,7 @@ void LSRInstance::OptimizeShadowIV() { ++UI; Instruction *ShadowUse = CandidateUI->getUser(); Type *DestTy = NULL; + bool IsSigned = false; /* If shadow use is a int->float cast then insert a second IV to eliminate this cast. @@ -1440,10 +1490,14 @@ void LSRInstance::OptimizeShadowIV() { for (unsigned i = 0; i < n; ++i, ++d) foo(d); */ - if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) + if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) { + IsSigned = false; DestTy = UCast->getDestTy(); - else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) + } + else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) { + IsSigned = true; DestTy = SCast->getDestTy(); + } if (!DestTy) continue; if (TLI) { @@ -1474,7 +1528,9 @@ void LSRInstance::OptimizeShadowIV() { ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry)); if (!Init) continue; - Constant *NewInit = ConstantFP::get(DestTy, Init->getZExtValue()); + Constant *NewInit = ConstantFP::get(DestTy, IsSigned ? + (double)Init->getSExtValue() : + (double)Init->getZExtValue()); BinaryOperator *Incr = dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch)); @@ -3275,6 +3331,9 @@ retry: skip:; } + if (!EnableRetry && !AnySatisfiedReqRegs) + return; + // If none of the formulae had all of the required registers, relax the // constraint so that we don't exclude all formulae. if (!AnySatisfiedReqRegs) { @@ -3298,6 +3357,10 @@ void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const { // SolveRecurse does all the work. SolveRecurse(Solution, SolutionCost, Workspace, CurCost, CurRegs, VisitedRegs); + if (Solution.empty()) { + DEBUG(dbgs() << "\nNo Satisfactory Solution\n"); + return; + } // Ok, we've now made all our decisions. DEBUG(dbgs() << "\n" @@ -3416,6 +3479,9 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator IP, // Don't insert instructions before PHI nodes. while (isa<PHINode>(IP)) ++IP; + // Ignore landingpad instructions. + while (isa<LandingPadInst>(IP)) ++IP; + // Ignore debug intrinsics. while (isa<DbgInfoIntrinsic>(IP)) ++IP; @@ -3527,7 +3593,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // The other interesting way of "folding" with an ICmpZero is to use a // negated immediate. if (!ICmpScaledV) - ICmpScaledV = ConstantInt::get(IntTy, -Offset); + ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset); else { Ops.push_back(SE.getUnknown(ICmpScaledV)); ICmpScaledV = ConstantInt::get(IntTy, Offset); @@ -3611,10 +3677,20 @@ void LSRInstance::RewriteForPHI(PHINode *PN, // users. if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 && !isa<IndirectBrInst>(BB->getTerminator())) { - Loop *PNLoop = LI.getLoopFor(PN->getParent()); - if (!PNLoop || PN->getParent() != PNLoop->getHeader()) { + BasicBlock *Parent = PN->getParent(); + Loop *PNLoop = LI.getLoopFor(Parent); + if (!PNLoop || Parent != PNLoop->getHeader()) { // Split the critical edge. 
- BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P); + BasicBlock *NewBB = 0; + if (!Parent->isLandingPad()) { + NewBB = SplitCriticalEdge(BB, Parent, P, + /*MergeIdenticalEdges=*/true, + /*DontDeleteUselessPhis=*/true); + } else { + SmallVector<BasicBlock*, 2> NewBBs; + SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs); + NewBB = NewBBs[0]; + } // If PN is outside of the loop and BB is in the loop, we want to // move the block to be immediately before the PHI block, not @@ -3700,6 +3776,7 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, SCEVExpander Rewriter(SE, "lsr"); Rewriter.disableCanonicalMode(); + Rewriter.enableLSRMode(); Rewriter.setIVIncInsertPos(L, IVIncInsertPos); // Expand the new value definitions and update the users. @@ -3740,6 +3817,23 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) OptimizeShadowIV(); OptimizeLoopTermCond(); + // If loop preparation eliminates all interesting IV users, bail. + if (IU.empty()) return; + + // Skip nested loops until we can model them better with formulae. + if (!EnableNested && !L->empty()) { + + if (EnablePhiElim) { + // Remove any extra phis created by processing inner loops. + SmallVector<WeakVH, 16> DeadInsts; + SCEVExpander Rewriter(SE, "lsr"); + Changed |= Rewriter.replaceCongruentIVs(L, &DT, DeadInsts); + Changed |= DeleteTriviallyDeadInstructions(DeadInsts); + } + DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n"); + return; + } + // Start collecting data and preparing for the solver. CollectInterestingTypesAndFactors(); CollectFixupsAndInitialFormulae(); @@ -3763,6 +3857,9 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) Types.clear(); RegUses.clear(); + if (Solution.empty()) + return; + #ifndef NDEBUG // Formulae should be legal. for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), @@ -3778,6 +3875,14 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) // Now that we've decided what we want, make it so. ImplementSolution(Solution, P); + + if (EnablePhiElim) { + // Remove any extra phis created by processing inner loops. + SmallVector<WeakVH, 16> DeadInsts; + SCEVExpander Rewriter(SE, "lsr"); + Changed |= Rewriter.replaceCongruentIVs(L, &DT, DeadInsts); + Changed |= DeleteTriviallyDeadInstructions(DeadInsts); + } } void LSRInstance::print_factors_and_types(raw_ostream &OS) const { diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index fef6bc3..91395b2 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -22,6 +22,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include "llvm/Target/TargetData.h" #include <climits> using namespace llvm; @@ -39,6 +40,11 @@ UnrollAllowPartial("unroll-allow-partial", cl::init(false), cl::Hidden, cl::desc("Allows loops to be partially unrolled until " "-unroll-threshold loop size is reached.")); +// Temporary flag to be removed in 3.0 +static cl::opt<bool> +NoSCEVUnroll("disable-unroll-scev", cl::init(false), cl::Hidden, + cl::desc("Use ScalarEvolution to analyze loop trip counts for unrolling")); + namespace { class LoopUnroll : public LoopPass { public: @@ -49,7 +55,7 @@ namespace { CurrentAllowPartial = (P == -1) ? 
UnrollAllowPartial : (bool)P; UserThreshold = (T != -1) || (UnrollThreshold.getNumOccurrences() > 0); - + initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); } @@ -57,11 +63,11 @@ namespace { /// that the loop unroll should be performed regardless of how much /// code expansion would result. static const unsigned NoThreshold = UINT_MAX; - + // Threshold to use when optsize is specified (and there is no // explicit -unroll-threshold). static const unsigned OptSizeUnrollThreshold = 50; - + unsigned CurrentCount; unsigned CurrentThreshold; bool CurrentAllowPartial; @@ -79,6 +85,7 @@ namespace { AU.addPreservedID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); + AU.addRequired<ScalarEvolution>(); AU.addPreserved<ScalarEvolution>(); // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info. // If loop unroll does not preserve dom info then LCSSA pass on next @@ -101,45 +108,62 @@ Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial) { } /// ApproximateLoopSize - Approximate the size of the loop. -static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls) { +static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, + const TargetData *TD) { CodeMetrics Metrics; for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) - Metrics.analyzeBasicBlock(*I); + Metrics.analyzeBasicBlock(*I, TD); NumCalls = Metrics.NumInlineCandidates; - + unsigned LoopSize = Metrics.NumInsts; - + // Don't allow an estimate of size zero. This would allows unrolling of loops // with huge iteration counts, which is a compile time problem even if it's // not a problem for code quality. if (LoopSize == 0) LoopSize = 1; - + return LoopSize; } bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { LoopInfo *LI = &getAnalysis<LoopInfo>(); + ScalarEvolution *SE = &getAnalysis<ScalarEvolution>(); BasicBlock *Header = L->getHeader(); DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName() << "] Loop %" << Header->getName() << "\n"); (void)Header; - + // Determine the current unrolling threshold. While this is normally set // from UnrollThreshold, it is overridden to a smaller value if the current // function is marked as optimize-for-size, and the unroll threshold was // not user specified. unsigned Threshold = CurrentThreshold; - if (!UserThreshold && + if (!UserThreshold && Header->getParent()->hasFnAttr(Attribute::OptimizeForSize)) Threshold = OptSizeUnrollThreshold; - // Find trip count - unsigned TripCount = L->getSmallConstantTripCount(); - unsigned Count = CurrentCount; - + // Find trip count and trip multiple if count is not available + unsigned TripCount = 0; + unsigned TripMultiple = 1; + if (!NoSCEVUnroll) { + // Find "latch trip count". UnrollLoop assumes that control cannot exit + // via the loop latch on any iteration prior to TripCount. The loop may exit + // early via an earlier branch. + BasicBlock *LatchBlock = L->getLoopLatch(); + if (LatchBlock) { + TripCount = SE->getSmallConstantTripCount(L, LatchBlock); + TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock); + } + } + else { + TripCount = L->getSmallConstantTripCount(); + if (TripCount == 0) + TripMultiple = L->getSmallConstantTripMultiple(); + } // Automatically select an unroll count. 
+ unsigned Count = CurrentCount; if (Count == 0) { // Conservative heuristic: if we know the trip count, see if we can // completely unroll (subject to the threshold, checked below); otherwise @@ -152,8 +176,9 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // Enforce the threshold. if (Threshold != NoThreshold) { + const TargetData *TD = getAnalysisIfAvailable<TargetData>(); unsigned NumInlineCandidates; - unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates); + unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates, TD); DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); if (NumInlineCandidates != 0) { DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); @@ -182,12 +207,8 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { } // Unroll the loop. - Function *F = L->getHeader()->getParent(); - if (!UnrollLoop(L, Count, LI, &LPM)) + if (!UnrollLoop(L, Count, TripCount, TripMultiple, LI, &LPM)) return false; - // FIXME: Reconstruct dom info, because it is not preserved properly. - if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) - DT->runOnFunction(*F); return true; } diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 840c4b6..458949c 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -492,7 +492,7 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, Value *BranchVal = LIC; if (!isa<ConstantInt>(Val) || Val->getType() != Type::getInt1Ty(LIC->getContext())) - BranchVal = new ICmpInst(InsertPt, ICmpInst::ICMP_EQ, LIC, Val, "tmp"); + BranchVal = new ICmpInst(InsertPt, ICmpInst::ICMP_EQ, LIC, Val); else if (Val != ConstantInt::getTrue(Val->getContext())) // We want to enter the new loop when the condition is true. std::swap(TrueDest, FalseDest); @@ -561,10 +561,17 @@ void LoopUnswitch::SplitExitEdges(Loop *L, BasicBlock *ExitBlock = ExitBlocks[i]; SmallVector<BasicBlock *, 4> Preds(pred_begin(ExitBlock), pred_end(ExitBlock)); + // Although SplitBlockPredecessors doesn't preserve loop-simplify in // general, if we call it on all predecessors of all exits then it does. - SplitBlockPredecessors(ExitBlock, Preds.data(), Preds.size(), - ".us-lcssa", this); + if (!ExitBlock->isLandingPad()) { + SplitBlockPredecessors(ExitBlock, Preds.data(), Preds.size(), + ".us-lcssa", this); + } else { + SmallVector<BasicBlock*, 2> NewBBs; + SplitLandingPadPredecessors(ExitBlock, Preds, ".us-lcssa", ".us-lcssa", + this, NewBBs); + } } } @@ -632,7 +639,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, // as well. ParentLoop->addBasicBlockToLoop(NewBlocks[0], LI->getBase()); } - + for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[i]]); // The new exit block should be in the same loop as the old one. @@ -653,6 +660,19 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, if (It != VMap.end()) V = It->second; PN->addIncoming(V, NewExit); } + + if (LandingPadInst *LPad = NewExit->getLandingPadInst()) { + PN = PHINode::Create(LPad->getType(), 0, "", + ExitSucc->getFirstInsertionPt()); + + for (pred_iterator I = pred_begin(ExitSucc), E = pred_end(ExitSucc); + I != E; ++I) { + BasicBlock *BB = *I; + LandingPadInst *LPI = BB->getLandingPadInst(); + LPI->replaceAllUsesWith(PN); + PN->addIncoming(LPI, BB); + } + } } // Rewrite the code to refer to itself. 
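A small standalone example, independent of the patch, of why the LoopUnrollPass.cpp change above tracks a trip multiple separately from a trip count: when the latch is known to execute a multiple of 4 times, unrolling by 4 needs no remainder iterations even though the exact count is unknown at compile time. The function names and the factor 4 are illustrative only.

#include <cassert>

static int work(unsigned i) { return static_cast<int>(i); }

// Trip count 4*n: unknown at compile time, but always a multiple of 4.
static int original(unsigned n) {
  int sum = 0;
  for (unsigned i = 0; i != 4 * n; ++i)
    sum += work(i);
  return sum;
}

// Unrolled by the trip multiple: no prologue or epilogue loop is required.
static int unrolledBy4(unsigned n) {
  int sum = 0;
  for (unsigned i = 0; i != 4 * n; i += 4)
    sum += work(i) + work(i + 1) + work(i + 2) + work(i + 3);
  return sum;
}

int main() {
  for (unsigned n = 0; n != 8; ++n)
    assert(original(n) == unrolledBy4(n));
  return 0;
}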
diff --git a/lib/Transforms/Scalar/LowerAtomic.cpp b/lib/Transforms/Scalar/LowerAtomic.cpp index 9087b46..689bbe9 100644 --- a/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/lib/Transforms/Scalar/LowerAtomic.cpp @@ -20,98 +20,88 @@ #include "llvm/Support/IRBuilder.h" using namespace llvm; -static bool LowerAtomicIntrinsic(IntrinsicInst *II) { - IRBuilder<> Builder(II->getParent(), II); - unsigned IID = II->getIntrinsicID(); - switch (IID) { - case Intrinsic::memory_barrier: - break; +static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { + IRBuilder<> Builder(CXI->getParent(), CXI); + Value *Ptr = CXI->getPointerOperand(); + Value *Cmp = CXI->getCompareOperand(); + Value *Val = CXI->getNewValOperand(); + + LoadInst *Orig = Builder.CreateLoad(Ptr); + Value *Equal = Builder.CreateICmpEQ(Orig, Cmp); + Value *Res = Builder.CreateSelect(Equal, Val, Orig); + Builder.CreateStore(Res, Ptr); + + CXI->replaceAllUsesWith(Orig); + CXI->eraseFromParent(); + return true; +} - case Intrinsic::atomic_load_add: - case Intrinsic::atomic_load_sub: - case Intrinsic::atomic_load_and: - case Intrinsic::atomic_load_nand: - case Intrinsic::atomic_load_or: - case Intrinsic::atomic_load_xor: - case Intrinsic::atomic_load_max: - case Intrinsic::atomic_load_min: - case Intrinsic::atomic_load_umax: - case Intrinsic::atomic_load_umin: { - Value *Ptr = II->getArgOperand(0), *Delta = II->getArgOperand(1); +static bool LowerAtomicRMWInst(AtomicRMWInst *RMWI) { + IRBuilder<> Builder(RMWI->getParent(), RMWI); + Value *Ptr = RMWI->getPointerOperand(); + Value *Val = RMWI->getValOperand(); - LoadInst *Orig = Builder.CreateLoad(Ptr); - Value *Res = NULL; - switch (IID) { - default: assert(0 && "Unrecognized atomic modify operation"); - case Intrinsic::atomic_load_add: - Res = Builder.CreateAdd(Orig, Delta); - break; - case Intrinsic::atomic_load_sub: - Res = Builder.CreateSub(Orig, Delta); - break; - case Intrinsic::atomic_load_and: - Res = Builder.CreateAnd(Orig, Delta); - break; - case Intrinsic::atomic_load_nand: - Res = Builder.CreateNot(Builder.CreateAnd(Orig, Delta)); - break; - case Intrinsic::atomic_load_or: - Res = Builder.CreateOr(Orig, Delta); - break; - case Intrinsic::atomic_load_xor: - Res = Builder.CreateXor(Orig, Delta); - break; - case Intrinsic::atomic_load_max: - Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Delta), - Delta, Orig); - break; - case Intrinsic::atomic_load_min: - Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Delta), - Orig, Delta); - break; - case Intrinsic::atomic_load_umax: - Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Delta), - Delta, Orig); - break; - case Intrinsic::atomic_load_umin: - Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Delta), - Orig, Delta); - break; - } - Builder.CreateStore(Res, Ptr); + LoadInst *Orig = Builder.CreateLoad(Ptr); + Value *Res = NULL; - II->replaceAllUsesWith(Orig); + switch (RMWI->getOperation()) { + default: llvm_unreachable("Unexpected RMW operation"); + case AtomicRMWInst::Xchg: + Res = Val; break; - } - - case Intrinsic::atomic_swap: { - Value *Ptr = II->getArgOperand(0), *Val = II->getArgOperand(1); - LoadInst *Orig = Builder.CreateLoad(Ptr); - Builder.CreateStore(Val, Ptr); - II->replaceAllUsesWith(Orig); + case AtomicRMWInst::Add: + Res = Builder.CreateAdd(Orig, Val); break; - } - - case Intrinsic::atomic_cmp_swap: { - Value *Ptr = II->getArgOperand(0), *Cmp = II->getArgOperand(1); - Value *Val = II->getArgOperand(2); - - LoadInst *Orig = Builder.CreateLoad(Ptr); - Value *Equal = Builder.CreateICmpEQ(Orig, Cmp); 
- Value *Res = Builder.CreateSelect(Equal, Val, Orig); - Builder.CreateStore(Res, Ptr); - II->replaceAllUsesWith(Orig); + case AtomicRMWInst::Sub: + Res = Builder.CreateSub(Orig, Val); + break; + case AtomicRMWInst::And: + Res = Builder.CreateAnd(Orig, Val); + break; + case AtomicRMWInst::Nand: + Res = Builder.CreateNot(Builder.CreateAnd(Orig, Val)); + break; + case AtomicRMWInst::Or: + Res = Builder.CreateOr(Orig, Val); + break; + case AtomicRMWInst::Xor: + Res = Builder.CreateXor(Orig, Val); + break; + case AtomicRMWInst::Max: + Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val), + Val, Orig); + break; + case AtomicRMWInst::Min: + Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val), + Orig, Val); + break; + case AtomicRMWInst::UMax: + Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val), + Val, Orig); + break; + case AtomicRMWInst::UMin: + Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val), + Orig, Val); break; } + Builder.CreateStore(Res, Ptr); + RMWI->replaceAllUsesWith(Orig); + RMWI->eraseFromParent(); + return true; +} - default: - return false; - } +static bool LowerFenceInst(FenceInst *FI) { + FI->eraseFromParent(); + return true; +} - assert(II->use_empty() && - "Lowering should have eliminated any uses of the intrinsic call!"); - II->eraseFromParent(); +static bool LowerLoadInst(LoadInst *LI) { + LI->setAtomic(NotAtomic); + return true; +} +static bool LowerStoreInst(StoreInst *SI) { + SI->setAtomic(NotAtomic); return true; } @@ -123,9 +113,22 @@ namespace { } bool runOnBasicBlock(BasicBlock &BB) { bool Changed = false; - for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(DI++)) - Changed |= LowerAtomicIntrinsic(II); + for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) { + Instruction *Inst = DI++; + if (FenceInst *FI = dyn_cast<FenceInst>(Inst)) + Changed |= LowerFenceInst(FI); + else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(Inst)) + Changed |= LowerAtomicCmpXchgInst(CXI); + else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(Inst)) + Changed |= LowerAtomicRMWInst(RMWI); + else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + if (LI->isAtomic()) + LowerLoadInst(LI); + } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + if (SI->isAtomic()) + LowerStoreInst(SI); + } + } return Changed; } }; diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index ba5ee68..298d692 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -384,7 +384,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) { // If this is a store, see if we can merge it in. - if (NextStore->isVolatile()) break; + if (!NextStore->isSimple()) break; // Check to see if this stored value is of the same byte-splattable value. if (ByteVal != isBytewiseValue(NextStore->getOperand(0))) @@ -479,7 +479,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { - if (SI->isVolatile()) return false; + if (!SI->isSimple()) return false; if (TD == 0) return false; @@ -487,7 +487,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { // happen to be using a load-store pair to implement it, rather than // a memcpy. 
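Aside (not part of the patch): the isSimple() checks introduced in the MemCpyOpt hunks above tighten the old isVolatile() tests, so a load or store now also has to be non-atomic before the pass will touch it. The pattern the surrounding code looks for is an aggregate copy written as a wide load feeding a single store in the same block. A source-level sketch of that shape, with made-up names; whether a front end emits a load/store pair or a memcpy for this depends on the type:

    #include <cstring>
    #include <cstdio>

    struct Blob { char bytes[64]; };

    // A plain struct assignment may be lowered to a wide load + store pair;
    // when both sides are "simple" (non-volatile, non-atomic), MemCpyOpt may
    // treat the pair like a memcpy and optimize it further.
    void copy_blob(Blob *dst, const Blob *src) {
      *dst = *src;
    }

    int main() {
      Blob a, b;
      std::memset(a.bytes, 'x', sizeof a.bytes);
      copy_blob(&b, &a);
      std::printf("%c\n", b.bytes[63]); // x
      return 0;
    }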
if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) { - if (!LI->isVolatile() && LI->hasOneUse() && + if (LI->isSimple() && LI->hasOneUse() && LI->getParent() == SI->getParent()) { MemDepResult ldep = MD->getDependency(LI); CallInst *C = 0; @@ -806,21 +806,26 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { // a) memcpy-memcpy xform which exposes redundance for DSE. // b) call-memcpy xform for return slot optimization. MemDepResult DepInfo = MD->getDependency(M); - if (!DepInfo.isClobber()) - return false; - - if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst())) - return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue()); - - if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) { - if (performCallSlotOptzn(M, M->getDest(), M->getSource(), - CopySize->getZExtValue(), C)) { - MD->removeInstruction(M); - M->eraseFromParent(); - return true; + if (DepInfo.isClobber()) { + if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) { + if (performCallSlotOptzn(M, M->getDest(), M->getSource(), + CopySize->getZExtValue(), C)) { + MD->removeInstruction(M); + M->eraseFromParent(); + return true; + } } } + AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + AliasAnalysis::Location SrcLoc = AA.getLocationForSource(M); + MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(SrcLoc, true, + M, M->getParent()); + if (SrcDepInfo.isClobber()) { + if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst())) + return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue()); + } + return false; } @@ -860,7 +865,7 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { // Find out what feeds this byval argument. Value *ByValArg = CS.getArgument(ArgNo); - Type *ByValTy =cast<PointerType>(ByValArg->getType())->getElementType(); + Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType(); uint64_t ByValSize = TD->getTypeAllocSize(ByValTy); MemDepResult DepInfo = MD->getPointerDependencyFrom(AliasAnalysis::Location(ByValArg, ByValSize), diff --git a/lib/Transforms/Scalar/ObjCARC.cpp b/lib/Transforms/Scalar/ObjCARC.cpp index f2e5ff9..80f5f01 100644 --- a/lib/Transforms/Scalar/ObjCARC.cpp +++ b/lib/Transforms/Scalar/ObjCARC.cpp @@ -344,6 +344,10 @@ static InstructionClass GetInstructionClass(const Value *V) { break; default: // For anything else, check all the operands. + // Note that this includes both operands of a Store: while the first + // operand isn't actually being dereferenced, it is being stored to + // memory where we can no longer track who might read it and dereference + // it, so we have to consider it potentially used. for (User::const_op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) if (IsPotentialUse(*OI)) @@ -421,9 +425,10 @@ static bool IsAlwaysTail(InstructionClass Class) { /// IsNoThrow - Test if the given class represents instructions which are always /// safe to mark with the nounwind attribute.. static bool IsNoThrow(InstructionClass Class) { + // objc_retainBlock is not nounwind because it calls user copy constructors + // which could theoretically throw. return Class == IC_Retain || Class == IC_RetainRV || - Class == IC_RetainBlock || Class == IC_Release || Class == IC_Autorelease || Class == IC_AutoreleaseRV || @@ -515,6 +520,10 @@ static bool IsObjCIdentifiedObject(const Value *V) { const Value *Pointer = StripPointerCastsAndObjCCalls(LI->getPointerOperand()); if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Pointer)) { + // A constant pointer can't be pointing to an object on the heap. 
It may + // be reference-counted, but it won't be deleted. + if (GV->isConstant()) + return true; StringRef Name = GV->getName(); // These special variables are known to hold values which are not // reference-counted pointers. @@ -738,7 +747,6 @@ ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) { switch (GetBasicInstructionClass(CS.getInstruction())) { case IC_Retain: case IC_RetainRV: - case IC_RetainBlock: case IC_Autorelease: case IC_AutoreleaseRV: case IC_NoopCast: @@ -746,6 +754,8 @@ ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) { case IC_FusedRetainAutorelease: case IC_FusedRetainAutoreleaseRV: // These functions don't access any memory visible to the compiler. + // Note that this doesn't include objc_retainBlock, becuase it updates + // pointers when it copies block data. return NoModRef; default: break; @@ -877,7 +887,9 @@ bool ObjCARCExpand::runOnFunction(Function &F) { // usually can't sink them past other calls, which would be the main // case where it would be useful. -/// TODO: The pointer returned from objc_loadWeakRetained is retained. +// TODO: The pointer returned from objc_loadWeakRetained is retained. + +// TODO: Delete release+retain pairs (rare). #include "llvm/GlobalAlias.h" #include "llvm/Constants.h" @@ -1098,16 +1110,16 @@ static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) { if (A == S_None || B == S_None) return S_None; - // Note that we can't merge S_CanRelease and S_Use. if (A > B) std::swap(A, B); if (TopDown) { // Choose the side which is further along in the sequence. - if (A == S_Retain && (B == S_CanRelease || B == S_Use)) + if ((A == S_Retain || A == S_CanRelease) && + (B == S_CanRelease || B == S_Use)) return B; } else { // Choose the side which is further along in the sequence. if ((A == S_Use || A == S_CanRelease) && - (B == S_Release || B == S_Stop || B == S_MovableRelease)) + (B == S_Use || B == S_Release || B == S_Stop || B == S_MovableRelease)) return A; // If both sides are releases, choose the more conservative one. if (A == S_Stop && (B == S_Release || B == S_MovableRelease)) @@ -1124,22 +1136,37 @@ namespace { /// retain-decrement-use-release sequence or release-use-decrement-retain /// reverese sequence. struct RRInfo { - /// KnownIncremented - After an objc_retain, the reference count of the - /// referenced object is known to be positive. Similarly, before an - /// objc_release, the reference count of the referenced object is known to - /// be positive. If there are retain-release pairs in code regions where the - /// retain count is known to be positive, they can be eliminated, regardless - /// of any side effects between them. - bool KnownIncremented; + /// KnownSafe - After an objc_retain, the reference count of the referenced + /// object is known to be positive. Similarly, before an objc_release, the + /// reference count of the referenced object is known to be positive. If + /// there are retain-release pairs in code regions where the retain count + /// is known to be positive, they can be eliminated, regardless of any side + /// effects between them. + /// + /// Also, a retain+release pair nested within another retain+release + /// pair all on the known same pointer value can be eliminated, regardless + /// of any intervening side effects. + /// + /// KnownSafe is true when either of these conditions is satisfied. + bool KnownSafe; /// IsRetainBlock - True if the Calls are objc_retainBlock calls (as /// opposed to objc_retain calls). 
bool IsRetainBlock; + /// CopyOnEscape - True if this the Calls are objc_retainBlock calls + /// which all have the !clang.arc.copy_on_escape metadata. + bool CopyOnEscape; + /// IsTailCallRelease - True of the objc_release calls are all marked /// with the "tail" keyword. bool IsTailCallRelease; + /// Partial - True of we've seen an opportunity for partial RR elimination, + /// such as pushing calls into a CFG triangle or into one side of a + /// CFG diamond. + bool Partial; + /// ReleaseMetadata - If the Calls are objc_release calls and they all have /// a clang.imprecise_release tag, this is the metadata tag. MDNode *ReleaseMetadata; @@ -1153,7 +1180,8 @@ namespace { SmallPtrSet<Instruction *, 2> ReverseInsertPts; RRInfo() : - KnownIncremented(false), IsRetainBlock(false), IsTailCallRelease(false), + KnownSafe(false), IsRetainBlock(false), CopyOnEscape(false), + IsTailCallRelease(false), Partial(false), ReleaseMetadata(0) {} void clear(); @@ -1161,9 +1189,11 @@ namespace { } void RRInfo::clear() { - KnownIncremented = false; + KnownSafe = false; IsRetainBlock = false; + CopyOnEscape = false; IsTailCallRelease = false; + Partial = false; ReleaseMetadata = 0; Calls.clear(); ReverseInsertPts.clear(); @@ -1176,6 +1206,9 @@ namespace { /// RefCount - The known minimum number of reference count increments. unsigned RefCount; + /// NestCount - The known minimum level of retain+release nesting. + unsigned NestCount; + /// Seq - The current position in the sequence. Sequence Seq; @@ -1184,7 +1217,11 @@ namespace { /// TODO: Encapsulate this better. RRInfo RRI; - PtrState() : RefCount(0), Seq(S_None) {} + PtrState() : RefCount(0), NestCount(0), Seq(S_None) {} + + void SetAtLeastOneRefCount() { + if (RefCount == 0) RefCount = 1; + } void IncrementRefCount() { if (RefCount != UINT_MAX) ++RefCount; @@ -1194,14 +1231,22 @@ namespace { if (RefCount != 0) --RefCount; } - void ClearRefCount() { - RefCount = 0; - } - bool IsKnownIncremented() const { return RefCount > 0; } + void IncrementNestCount() { + if (NestCount != UINT_MAX) ++NestCount; + } + + void DecrementNestCount() { + if (NestCount != 0) --NestCount; + } + + bool IsKnownNested() const { + return NestCount > 0; + } + void SetSeq(Sequence NewSeq) { Seq = NewSeq; } @@ -1233,23 +1278,40 @@ void PtrState::Merge(const PtrState &Other, bool TopDown) { Seq = MergeSeqs(Seq, Other.Seq, TopDown); RefCount = std::min(RefCount, Other.RefCount); + NestCount = std::min(NestCount, Other.NestCount); // We can't merge a plain objc_retain with an objc_retainBlock. if (RRI.IsRetainBlock != Other.RRI.IsRetainBlock) Seq = S_None; + // If we're not in a sequence (anymore), drop all associated state. if (Seq == S_None) { RRI.clear(); + } else if (RRI.Partial || Other.RRI.Partial) { + // If we're doing a merge on a path that's previously seen a partial + // merge, conservatively drop the sequence, to avoid doing partial + // RR elimination. If the branch predicates for the two merge differ, + // mixing them is unsafe. + Seq = S_None; + RRI.clear(); } else { // Conservatively merge the ReleaseMetadata information. 
if (RRI.ReleaseMetadata != Other.RRI.ReleaseMetadata) RRI.ReleaseMetadata = 0; - RRI.KnownIncremented = RRI.KnownIncremented && Other.RRI.KnownIncremented; + RRI.CopyOnEscape = RRI.CopyOnEscape && Other.RRI.CopyOnEscape; + RRI.KnownSafe = RRI.KnownSafe && Other.RRI.KnownSafe; RRI.IsTailCallRelease = RRI.IsTailCallRelease && Other.RRI.IsTailCallRelease; RRI.Calls.insert(Other.RRI.Calls.begin(), Other.RRI.Calls.end()); - RRI.ReverseInsertPts.insert(Other.RRI.ReverseInsertPts.begin(), - Other.RRI.ReverseInsertPts.end()); + + // Merge the insert point sets. If there are any differences, + // that makes this a partial merge. + RRI.Partial = RRI.ReverseInsertPts.size() != + Other.RRI.ReverseInsertPts.size(); + for (SmallPtrSet<Instruction *, 2>::const_iterator + I = Other.RRI.ReverseInsertPts.begin(), + E = Other.RRI.ReverseInsertPts.end(); I != E; ++I) + RRI.Partial |= RRI.ReverseInsertPts.insert(*I); } } @@ -1316,7 +1378,7 @@ namespace { } void clearBottomUpPointers() { - PerPtrTopDown.clear(); + PerPtrBottomUp.clear(); } void clearTopDownPointers() { @@ -1334,6 +1396,12 @@ namespace { unsigned GetAllPathCount() const { return TopDownPathCount * BottomUpPathCount; } + + /// IsVisitedTopDown - Test whether the block for this BBState has been + /// visited by the top-down portion of the algorithm. + bool isVisitedTopDown() const { + return TopDownPathCount != 0; + } }; } @@ -1364,7 +1432,7 @@ void BBState::MergePred(const BBState &Other) { /*TopDown=*/true); } - // For each entry in our set, if the other set doens't have an entry with the + // For each entry in our set, if the other set doesn't have an entry with the // same key, force it to merge with an empty entry. for (ptr_iterator MI = top_down_ptr_begin(), ME = top_down_ptr_end(); MI != ME; ++MI) @@ -1389,7 +1457,7 @@ void BBState::MergeSucc(const BBState &Other) { /*TopDown=*/false); } - // For each entry in our set, if the other set doens't have an entry + // For each entry in our set, if the other set doesn't have an entry // with the same key, force it to merge with an empty entry. for (ptr_iterator MI = bottom_up_ptr_begin(), ME = bottom_up_ptr_end(); MI != ME; ++MI) @@ -1406,15 +1474,11 @@ namespace { /// Run - A flag indicating whether this optimization pass should run. bool Run; - /// RetainFunc, RelaseFunc - Declarations for objc_retain, - /// objc_retainBlock, and objc_release. - Function *RetainFunc, *RetainBlockFunc, *RetainRVFunc, *ReleaseFunc; - /// RetainRVCallee, etc. - Declarations for ObjC runtime /// functions, for use in creating calls to them. These are initialized /// lazily to avoid cluttering up the Module with unused declarations. Constant *RetainRVCallee, *AutoreleaseRVCallee, *ReleaseCallee, - *RetainCallee, *AutoreleaseCallee; + *RetainCallee, *RetainBlockCallee, *AutoreleaseCallee; /// UsedInThisFunciton - Flags which determine whether each of the /// interesting runtine functions is in fact used in the current function. @@ -1424,10 +1488,15 @@ namespace { /// metadata. unsigned ImpreciseReleaseMDKind; + /// CopyOnEscape - The Metadata Kind for clang.arc.copy_on_escape + /// metadata. 
+ unsigned CopyOnEscapeMDKind; + Constant *getRetainRVCallee(Module *M); Constant *getAutoreleaseRVCallee(Module *M); Constant *getReleaseCallee(Module *M); Constant *getRetainCallee(Module *M); + Constant *getRetainBlockCallee(Module *M); Constant *getAutoreleaseCallee(Module *M); void OptimizeRetainCall(Function &F, Instruction *Retain); @@ -1452,11 +1521,13 @@ namespace { void MoveCalls(Value *Arg, RRInfo &RetainsToMove, RRInfo &ReleasesToMove, MapVector<Value *, RRInfo> &Retains, DenseMap<Value *, RRInfo> &Releases, - SmallVectorImpl<Instruction *> &DeadInsts); + SmallVectorImpl<Instruction *> &DeadInsts, + Module *M); bool PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates, MapVector<Value *, RRInfo> &Retains, - DenseMap<Value *, RRInfo> &Releases); + DenseMap<Value *, RRInfo> &Releases, + Module *M); void OptimizeWeakCalls(Function &F); @@ -1561,6 +1632,23 @@ Constant *ObjCARCOpt::getRetainCallee(Module *M) { return RetainCallee; } +Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) { + if (!RetainBlockCallee) { + LLVMContext &C = M->getContext(); + std::vector<Type *> Params; + Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C))); + AttrListPtr Attributes; + // objc_retainBlock is not nounwind because it calls user copy constructors + // which could theoretically throw. + RetainBlockCallee = + M->getOrInsertFunction( + "objc_retainBlock", + FunctionType::get(Params[0], Params, /*isVarArg=*/false), + Attributes); + } + return RetainBlockCallee; +} + Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) { if (!AutoreleaseCallee) { LLVMContext &C = M->getContext(); @@ -1904,12 +1992,19 @@ void ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV) { // Check for a return of the pointer value. 
const Value *Ptr = GetObjCArg(AutoreleaseRV); - for (Value::const_use_iterator UI = Ptr->use_begin(), UE = Ptr->use_end(); - UI != UE; ++UI) { - const User *I = *UI; - if (isa<ReturnInst>(I) || GetBasicInstructionClass(I) == IC_RetainRV) - return; - } + SmallVector<const Value *, 2> Users; + Users.push_back(Ptr); + do { + Ptr = Users.pop_back_val(); + for (Value::const_use_iterator UI = Ptr->use_begin(), UE = Ptr->use_end(); + UI != UE; ++UI) { + const User *I = *UI; + if (isa<ReturnInst>(I) || GetBasicInstructionClass(I) == IC_RetainRV) + return; + if (isa<BitCastInst>(I)) + Users.push_back(I); + } + } while (!Users.empty()); Changed = true; ++NumPeeps; @@ -2132,41 +2227,49 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); bool SomeSuccHasSame = false; bool AllSuccsHaveSame = true; - for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) - switch (BBStates[*SI].getPtrBottomUpState(Arg).GetSeq()) { + PtrState &S = MyStates.getPtrTopDownState(Arg); + for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) { + PtrState &SuccS = BBStates[*SI].getPtrBottomUpState(Arg); + switch (SuccS.GetSeq()) { case S_None: - case S_CanRelease: - MyStates.getPtrTopDownState(Arg).ClearSequenceProgress(); - SomeSuccHasSame = false; - break; + case S_CanRelease: { + if (!S.RRI.KnownSafe && !SuccS.RRI.KnownSafe) + S.ClearSequenceProgress(); + continue; + } case S_Use: SomeSuccHasSame = true; break; case S_Stop: case S_Release: case S_MovableRelease: - AllSuccsHaveSame = false; + if (!S.RRI.KnownSafe && !SuccS.RRI.KnownSafe) + AllSuccsHaveSame = false; break; case S_Retain: llvm_unreachable("bottom-up pointer in retain state!"); } + } // If the state at the other end of any of the successor edges // matches the current state, require all edges to match. This // guards against loops in the middle of a sequence. if (SomeSuccHasSame && !AllSuccsHaveSame) - MyStates.getPtrTopDownState(Arg).ClearSequenceProgress(); + S.ClearSequenceProgress(); } case S_CanRelease: { const Value *Arg = I->first; const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); bool SomeSuccHasSame = false; bool AllSuccsHaveSame = true; - for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) - switch (BBStates[*SI].getPtrBottomUpState(Arg).GetSeq()) { - case S_None: - MyStates.getPtrTopDownState(Arg).ClearSequenceProgress(); - SomeSuccHasSame = false; - break; + PtrState &S = MyStates.getPtrTopDownState(Arg); + for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) { + PtrState &SuccS = BBStates[*SI].getPtrBottomUpState(Arg); + switch (SuccS.GetSeq()) { + case S_None: { + if (!S.RRI.KnownSafe && !SuccS.RRI.KnownSafe) + S.ClearSequenceProgress(); + continue; + } case S_CanRelease: SomeSuccHasSame = true; break; @@ -2174,16 +2277,18 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, case S_Release: case S_MovableRelease: case S_Use: - AllSuccsHaveSame = false; + if (!S.RRI.KnownSafe && !SuccS.RRI.KnownSafe) + AllSuccsHaveSame = false; break; case S_Retain: llvm_unreachable("bottom-up pointer in retain state!"); } + } // If the state at the other end of any of the successor edges // matches the current state, require all edges to match. This // guards against loops in the middle of a sequence. 
if (SomeSuccHasSame && !AllSuccsHaveSame) - MyStates.getPtrTopDownState(Arg).ClearSequenceProgress(); + S.ClearSequenceProgress(); } } } @@ -2207,6 +2312,8 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, if (Succ == BB) continue; DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ); + // If we haven't seen this node yet, then we've found a CFG cycle. + // Be optimistic here; it's CheckForCFGHazards' job detect trouble. if (I == BBStates.end()) continue; MyStates.InitFromSucc(I->second); @@ -2245,11 +2352,12 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, S.SetSeqToRelease(Inst->getMetadata(ImpreciseReleaseMDKind)); S.RRI.clear(); - S.RRI.KnownIncremented = S.IsKnownIncremented(); + S.RRI.KnownSafe = S.IsKnownNested() || S.IsKnownIncremented(); S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); S.RRI.Calls.insert(Inst); S.IncrementRefCount(); + S.IncrementNestCount(); break; } case IC_RetainBlock: @@ -2259,6 +2367,16 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, PtrState &S = MyStates.getPtrBottomUpState(Arg); S.DecrementRefCount(); + S.SetAtLeastOneRefCount(); + S.DecrementNestCount(); + + // An non-copy-on-escape objc_retainBlock call with just a use still + // needs to be kept, because it may be copying a block from the stack + // to the heap. + if (Class == IC_RetainBlock && + !Inst->getMetadata(CopyOnEscapeMDKind) && + S.GetSeq() == S_Use) + S.SetSeq(S_CanRelease); switch (S.GetSeq()) { case S_Stop: @@ -2272,6 +2390,8 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, // better to let it remain as the first instruction after a call. if (Class != IC_RetainRV) { S.RRI.IsRetainBlock = Class == IC_RetainBlock; + if (S.RRI.IsRetainBlock) + S.RRI.CopyOnEscape = !!Inst->getMetadata(CopyOnEscapeMDKind); Retains[Inst] = S.RRI; } S.ClearSequenceProgress(); @@ -2281,7 +2401,7 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, case S_Retain: llvm_unreachable("bottom-up pointer in retain state!"); } - break; + continue; } case IC_AutoreleasepoolPop: // Conservatively, clear MyStates for all known pointers. @@ -2305,26 +2425,22 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, PtrState &S = MI->second; Sequence Seq = S.GetSeq(); - // Check for possible retains and releases. + // Check for possible releases. if (CanAlterRefCount(Inst, Ptr, PA, Class)) { - // Check for a retain (we're going bottom-up here). S.DecrementRefCount(); - - // Check for a release. - if (!IsRetain(Class) && Class != IC_RetainBlock) - switch (Seq) { - case S_Use: - S.SetSeq(S_CanRelease); - continue; - case S_CanRelease: - case S_Release: - case S_MovableRelease: - case S_Stop: - case S_None: - break; - case S_Retain: - llvm_unreachable("bottom-up pointer in retain state!"); - } + switch (Seq) { + case S_Use: + S.SetSeq(S_CanRelease); + continue; + case S_CanRelease: + case S_Release: + case S_MovableRelease: + case S_Stop: + case S_None: + break; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + } } // Check for possible direct uses. @@ -2332,14 +2448,14 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, case S_Release: case S_MovableRelease: if (CanUse(Inst, Ptr, PA, Class)) { - S.RRI.ReverseInsertPts.clear(); + assert(S.RRI.ReverseInsertPts.empty()); S.RRI.ReverseInsertPts.insert(Inst); S.SetSeq(S_Use); } else if (Seq == S_Release && (Class == IC_User || Class == IC_CallOrUser)) { // Non-movable releases depend on any possible objc pointer use. 
S.SetSeq(S_Stop); - S.RRI.ReverseInsertPts.clear(); + assert(S.RRI.ReverseInsertPts.empty()); S.RRI.ReverseInsertPts.insert(Inst); } break; @@ -2378,14 +2494,18 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, if (Pred == BB) continue; DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred); - if (I == BBStates.end()) + assert(I != BBStates.end()); + // If we haven't seen this node yet, then we've found a CFG cycle. + // Be optimistic here; it's CheckForCFGHazards' job detect trouble. + if (!I->second.isVisitedTopDown()) continue; MyStates.InitFromPred(I->second); while (PI != PE) { Pred = *PI++; if (Pred != BB) { I = BBStates.find(Pred); - if (I != BBStates.end()) + assert(I != BBStates.end()); + if (I->second.isVisitedTopDown()) MyStates.MergePred(I->second); } } @@ -2422,18 +2542,25 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, S.SetSeq(S_Retain); S.RRI.clear(); S.RRI.IsRetainBlock = Class == IC_RetainBlock; - S.RRI.KnownIncremented = S.IsKnownIncremented(); + if (S.RRI.IsRetainBlock) + S.RRI.CopyOnEscape = !!Inst->getMetadata(CopyOnEscapeMDKind); + // Don't check S.IsKnownIncremented() here because it's not + // sufficient. + S.RRI.KnownSafe = S.IsKnownNested(); S.RRI.Calls.insert(Inst); } + S.SetAtLeastOneRefCount(); S.IncrementRefCount(); - break; + S.IncrementNestCount(); + continue; } case IC_Release: { Arg = GetObjCArg(Inst); PtrState &S = MyStates.getPtrTopDownState(Arg); S.DecrementRefCount(); + S.DecrementNestCount(); switch (S.GetSeq()) { case S_Retain: @@ -2478,16 +2605,12 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, Sequence Seq = S.GetSeq(); // Check for possible releases. - if (!IsRetain(Class) && Class != IC_RetainBlock && - CanAlterRefCount(Inst, Ptr, PA, Class)) { - // Check for a release. + if (CanAlterRefCount(Inst, Ptr, PA, Class)) { S.DecrementRefCount(); - - // Check for a release. switch (Seq) { case S_Retain: S.SetSeq(S_CanRelease); - S.RRI.ReverseInsertPts.clear(); + assert(S.RRI.ReverseInsertPts.empty()); S.RRI.ReverseInsertPts.insert(Inst); // One call can't cause a transition from S_Retain to S_CanRelease @@ -2511,8 +2634,19 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, if (CanUse(Inst, Ptr, PA, Class)) S.SetSeq(S_Use); break; - case S_Use: case S_Retain: + // A non-copy-on-scape objc_retainBlock call may be responsible for + // copying the block data from the stack to the heap. Model this by + // moving it straight from S_Retain to S_Use. + if (S.RRI.IsRetainBlock && + !S.RRI.CopyOnEscape && + CanUse(Inst, Ptr, PA, Class)) { + assert(S.RRI.ReverseInsertPts.empty()); + S.RRI.ReverseInsertPts.insert(Inst); + S.SetSeq(S_Use); + } + break; + case S_Use: case S_None: break; case S_Stop: @@ -2533,28 +2667,43 @@ ObjCARCOpt::Visit(Function &F, DenseMap<const BasicBlock *, BBState> &BBStates, MapVector<Value *, RRInfo> &Retains, DenseMap<Value *, RRInfo> &Releases) { - // Use postorder for bottom-up, and reverse-postorder for top-down, because we + // Use reverse-postorder on the reverse CFG for bottom-up, because we // magically know that loops will be well behaved, i.e. they won't repeatedly - // call retain on a single pointer without doing a release. + // call retain on a single pointer without doing a release. We can't use + // ReversePostOrderTraversal here because we want to walk up from each + // function exit point. 
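Aside (not part of the patch): the hunk that follows implements the traversal described in the comment above: start a DFS from every block with no successors, walk predecessor edges with an explicit stack, record blocks in postorder, then iterate that order in reverse so each block is visited bottom-up from the exits. A stand-alone sketch of the same idea on a made-up four-block diamond, using std::string block names instead of BasicBlock pointers:

    #include <cstddef>
    #include <cstdio>
    #include <map>
    #include <set>
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
      // Predecessor lists for a diamond CFG: entry -> {left, right} -> exit.
      std::map<std::string, std::vector<std::string>> preds = {
          {"entry", {}}, {"left", {"entry"}}, {"right", {"entry"}},
          {"exit", {"left", "right"}}};

      std::set<std::string> visited = {"exit"};
      std::vector<std::pair<std::string, std::size_t>> stack = {{"exit", 0}};
      std::vector<std::string> order; // postorder over predecessor edges

      while (!stack.empty()) {
        auto &frame = stack.back();
        if (frame.second < preds[frame.first].size()) {
          std::string p = preds[frame.first][frame.second++];
          if (visited.insert(p).second)
            stack.push_back({p, 0});
        } else {
          order.push_back(frame.first);
          stack.pop_back();
        }
      }

      // Walking 'order' in reverse visits the exit first, then its predecessors.
      for (auto it = order.rbegin(); it != order.rend(); ++it)
        std::printf("%s ", it->c_str()); // exit right left entry
      std::printf("\n");
      return 0;
    }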
+ SmallPtrSet<BasicBlock *, 16> Visited; + SmallVector<std::pair<BasicBlock *, pred_iterator>, 16> Stack; + SmallVector<BasicBlock *, 16> Order; + for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { + BasicBlock *BB = I; + if (BB->getTerminator()->getNumSuccessors() == 0) + Stack.push_back(std::make_pair(BB, pred_begin(BB))); + } + while (!Stack.empty()) { + pred_iterator End = pred_end(Stack.back().first); + while (Stack.back().second != End) { + BasicBlock *BB = *Stack.back().second++; + if (Visited.insert(BB)) + Stack.push_back(std::make_pair(BB, pred_begin(BB))); + } + Order.push_back(Stack.pop_back_val().first); + } bool BottomUpNestingDetected = false; - SmallVector<BasicBlock *, 8> PostOrder; - for (po_iterator<Function *> I = po_begin(&F), E = po_end(&F); I != E; ++I) { + for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator I = + Order.rbegin(), E = Order.rend(); I != E; ++I) { BasicBlock *BB = *I; - PostOrder.push_back(BB); - BottomUpNestingDetected |= VisitBottomUp(BB, BBStates, Retains); } - // Iterate through the post-order in reverse order, achieving a - // reverse-postorder traversal. We don't use the ReversePostOrderTraversal - // class here because it works by computing its own full postorder iteration, - // recording the sequence, and playing it back in reverse. Since we're already - // doing a full iteration above, we can just record the sequence manually and - // avoid the cost of having ReversePostOrderTraversal compute it. + // Use regular reverse-postorder for top-down. bool TopDownNestingDetected = false; - for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator - RI = PostOrder.rbegin(), RE = PostOrder.rend(); RI != RE; ++RI) - TopDownNestingDetected |= VisitTopDown(*RI, BBStates, Releases); + typedef ReversePostOrderTraversal<Function *> RPOTType; + RPOTType RPOT(&F); + for (RPOTType::rpo_iterator I = RPOT.begin(), E = RPOT.end(); I != E; ++I) { + BasicBlock *BB = *I; + TopDownNestingDetected |= VisitTopDown(BB, BBStates, Releases); + } return TopDownNestingDetected && BottomUpNestingDetected; } @@ -2565,12 +2714,10 @@ void ObjCARCOpt::MoveCalls(Value *Arg, RRInfo &ReleasesToMove, MapVector<Value *, RRInfo> &Retains, DenseMap<Value *, RRInfo> &Releases, - SmallVectorImpl<Instruction *> &DeadInsts) { + SmallVectorImpl<Instruction *> &DeadInsts, + Module *M) { Type *ArgTy = Arg->getType(); - Type *ParamTy = - (RetainRVFunc ? RetainRVFunc : - RetainFunc ? RetainFunc : - RetainBlockFunc)->arg_begin()->getType(); + Type *ParamTy = PointerType::getUnqual(Type::getInt8Ty(ArgTy->getContext())); // Insert the new retain and release calls. for (SmallPtrSet<Instruction *, 2>::const_iterator @@ -2581,9 +2728,12 @@ void ObjCARCOpt::MoveCalls(Value *Arg, new BitCastInst(Arg, ParamTy, "", InsertPt); CallInst *Call = CallInst::Create(RetainsToMove.IsRetainBlock ? - RetainBlockFunc : RetainFunc, + getRetainBlockCallee(M) : getRetainCallee(M), MyArg, "", InsertPt); Call->setDoesNotThrow(); + if (RetainsToMove.CopyOnEscape) + Call->setMetadata(CopyOnEscapeMDKind, + MDNode::get(M->getContext(), ArrayRef<Value *>())); if (!RetainsToMove.IsRetainBlock) Call->setTailCall(); } @@ -2598,8 +2748,8 @@ void ObjCARCOpt::MoveCalls(Value *Arg, // The invoke's return value isn't available in the unwind block, // but our releases will never depend on it, because they must be // paired with retains from before the invoke. 
- InsertPts[0] = II->getNormalDest()->getFirstNonPHI(); - InsertPts[1] = II->getUnwindDest()->getFirstNonPHI(); + InsertPts[0] = II->getNormalDest()->getFirstInsertionPt(); + InsertPts[1] = II->getUnwindDest()->getFirstInsertionPt(); } else { // Insert code immediately after the last use. InsertPts[0] = llvm::next(BasicBlock::iterator(LastUse)); @@ -2609,7 +2759,8 @@ void ObjCARCOpt::MoveCalls(Value *Arg, Instruction *InsertPt = *I; Value *MyArg = ArgTy == ParamTy ? Arg : new BitCastInst(Arg, ParamTy, "", InsertPt); - CallInst *Call = CallInst::Create(ReleaseFunc, MyArg, "", InsertPt); + CallInst *Call = CallInst::Create(getReleaseCallee(M), MyArg, + "", InsertPt); // Attach a clang.imprecise_release metadata tag, if appropriate. if (MDNode *M = ReleasesToMove.ReleaseMetadata) Call->setMetadata(ImpreciseReleaseMDKind, M); @@ -2640,7 +2791,8 @@ bool ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates, MapVector<Value *, RRInfo> &Retains, - DenseMap<Value *, RRInfo> &Releases) { + DenseMap<Value *, RRInfo> &Releases, + Module *M) { bool AnyPairsCompletelyEliminated = false; RRInfo RetainsToMove; RRInfo ReleasesToMove; @@ -2649,21 +2801,37 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> SmallVector<Instruction *, 8> DeadInsts; for (MapVector<Value *, RRInfo>::const_iterator I = Retains.begin(), - E = Retains.end(); I != E; ) { - Value *V = (I++)->first; + E = Retains.end(); I != E; ++I) { + Value *V = I->first; if (!V) continue; // blotted Instruction *Retain = cast<Instruction>(V); Value *Arg = GetObjCArg(Retain); - // If the object being released is in static or stack storage, we know it's + // If the object being released is in static storage, we know it's // not being managed by ObjC reference counting, so we can delete pairs // regardless of what possible decrements or uses lie between them. - bool KnownSafe = isa<Constant>(Arg) || isa<AllocaInst>(Arg); + bool KnownSafe = isa<Constant>(Arg); + + // Same for stack storage, unless this is a non-copy-on-escape + // objc_retainBlock call, which is responsible for copying the block data + // from the stack to the heap. + if ((!I->second.IsRetainBlock || I->second.CopyOnEscape) && + isa<AllocaInst>(Arg)) + KnownSafe = true; + + // A constant pointer can't be pointing to an object on the heap. It may + // be reference-counted, but it won't be deleted. + if (const LoadInst *LI = dyn_cast<LoadInst>(Arg)) + if (const GlobalVariable *GV = + dyn_cast<GlobalVariable>( + StripPointerCastsAndObjCCalls(LI->getPointerOperand()))) + if (GV->isConstant()) + KnownSafe = true; // If a pair happens in a region where it is known that the reference count // is already incremented, we can similarly ignore possible decrements. - bool KnownIncrementedTD = true, KnownIncrementedBU = true; + bool KnownSafeTD = true, KnownSafeBU = true; // Connect the dots between the top-down-collected RetainsToMove and // bottom-up-collected ReleasesToMove to form sets of related calls. 
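Aside (not part of the patch): PerformCodePlacement, continued in the next hunk, only deletes a retain/release pair outright when one side is KnownSafe, i.e. the reference count is provably positive across the region (or the pair is nested inside another pair on the same pointer). A toy C++ model of why that is sufficient; Obj, retain, and release are stand-ins invented for this sketch, not the Objective-C runtime functions:

    #include <cassert>

    struct Obj { int rc; };
    void retain(Obj &o)  { ++o.rc; }
    void release(Obj &o) { --o.rc; }

    int main() {
      Obj o{1};           // an outer reference already holds the object (rc >= 1)

      retain(o);          // inner pair: rc goes 1 -> 2
      /* ... uses of o; nothing here can free it, since rc stays >= 2 ... */
      release(o);         // rc goes 2 -> 1

      // Removing the inner pair leaves the count observed afterwards unchanged,
      // so the object's lifetime is unaffected.
      assert(o.rc == 1);
      return 0;
    }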
@@ -2683,7 +2851,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> MapVector<Value *, RRInfo>::const_iterator It = Retains.find(NewRetain); assert(It != Retains.end()); const RRInfo &NewRetainRRI = It->second; - KnownIncrementedTD &= NewRetainRRI.KnownIncremented; + KnownSafeTD &= NewRetainRRI.KnownSafe; for (SmallPtrSet<Instruction *, 2>::const_iterator LI = NewRetainRRI.Calls.begin(), LE = NewRetainRRI.Calls.end(); LI != LE; ++LI) { @@ -2739,7 +2907,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> Releases.find(NewRelease); assert(It != Releases.end()); const RRInfo &NewReleaseRRI = It->second; - KnownIncrementedBU &= NewReleaseRRI.KnownIncremented; + KnownSafeBU &= NewReleaseRRI.KnownSafe; for (SmallPtrSet<Instruction *, 2>::const_iterator LI = NewReleaseRRI.Calls.begin(), LE = NewReleaseRRI.Calls.end(); LI != LE; ++LI) { @@ -2759,6 +2927,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> // Merge the IsRetainBlock values. if (FirstRetain) { RetainsToMove.IsRetainBlock = NewReleaseRetainRRI.IsRetainBlock; + RetainsToMove.CopyOnEscape = NewReleaseRetainRRI.CopyOnEscape; FirstRetain = false; } else if (ReleasesToMove.IsRetainBlock != NewReleaseRetainRRI.IsRetainBlock) @@ -2766,6 +2935,9 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> // objc_retain and the other uses objc_retainBlock. goto next_retain; + // Merge the CopyOnEscape values. + RetainsToMove.CopyOnEscape &= NewReleaseRetainRRI.CopyOnEscape; + // Collect the optimal insertion points. if (!KnownSafe) for (SmallPtrSet<Instruction *, 2>::const_iterator @@ -2787,12 +2959,19 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> if (NewRetains.empty()) break; } - // If the pointer is known incremented, we can safely delete the pair - // regardless of what's between them. - if (KnownIncrementedTD || KnownIncrementedBU) { + // If the pointer is known incremented or nested, we can safely delete the + // pair regardless of what's between them. + if (KnownSafeTD || KnownSafeBU) { RetainsToMove.ReverseInsertPts.clear(); ReleasesToMove.ReverseInsertPts.clear(); NewCount = 0; + } else { + // Determine whether the new insertion points we computed preserve the + // balance of retain and release calls through the program. + // TODO: If the fully aggressive solution isn't valid, try to find a + // less aggressive solution which is. + if (NewDelta != 0) + goto next_retain; } // Determine whether the original call points are balanced in the retain and @@ -2803,18 +2982,12 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> if (OldDelta != 0) goto next_retain; - // Determine whether the new insertion points we computed preserve the - // balance of retain and release calls through the program. - // TODO: If the fully aggressive solution isn't valid, try to find a - // less aggressive solution which is. - if (NewDelta != 0) - goto next_retain; - // Ok, everything checks out and we're all set. Let's move some code! Changed = true; AnyPairsCompletelyEliminated = NewCount == 0; NumRRs += OldCount - NewCount; - MoveCalls(Arg, RetainsToMove, ReleasesToMove, Retains, Releases, DeadInsts); + MoveCalls(Arg, RetainsToMove, ReleasesToMove, + Retains, Releases, DeadInsts, M); next_retain: NewReleases.clear(); @@ -2993,7 +3166,8 @@ bool ObjCARCOpt::OptimizeSequences(Function &F) { bool NestingDetected = Visit(F, BBStates, Retains, Releases); // Transform. 
- return PerformCodePlacement(BBStates, Retains, Releases) && NestingDetected; + return PerformCodePlacement(BBStates, Retains, Releases, F.getParent()) && + NestingDetected; } /// OptimizeReturns - Look for this pattern: @@ -3072,7 +3246,8 @@ void ObjCARCOpt::OptimizeReturns(Function &F) { // Check that there is nothing that can affect the reference // count between the retain and the call. - FindDependencies(CanChangeRetainCount, Arg, BB, Retain, + // Note that Retain need not be in BB. + FindDependencies(CanChangeRetainCount, Arg, Retain->getParent(), Retain, DependingInstructions, Visited, PA); if (DependingInstructions.size() != 1) goto next_block; @@ -3116,12 +3291,8 @@ bool ObjCARCOpt::doInitialization(Module &M) { // Identify the imprecise release metadata kind. ImpreciseReleaseMDKind = M.getContext().getMDKindID("clang.imprecise_release"); - - // Identify the declarations for objc_retain and friends. - RetainFunc = M.getFunction("objc_retain"); - RetainBlockFunc = M.getFunction("objc_retainBlock"); - RetainRVFunc = M.getFunction("objc_retainAutoreleasedReturnValue"); - ReleaseFunc = M.getFunction("objc_release"); + CopyOnEscapeMDKind = + M.getContext().getMDKindID("clang.arc.copy_on_escape"); // Intuitively, objc_retain and others are nocapture, however in practice // they are not, because they return their argument value. And objc_release @@ -3132,6 +3303,7 @@ bool ObjCARCOpt::doInitialization(Module &M) { AutoreleaseRVCallee = 0; ReleaseCallee = 0; RetainCallee = 0; + RetainBlockCallee = 0; AutoreleaseCallee = 0; return false; @@ -3377,7 +3549,7 @@ ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease, void ObjCARCContract::ContractRelease(Instruction *Release, inst_iterator &Iter) { LoadInst *Load = dyn_cast<LoadInst>(GetObjCArg(Release)); - if (!Load || Load->isVolatile()) return; + if (!Load || !Load->isSimple()) return; // For now, require everything to be in one basic block. BasicBlock *BB = Release->getParent(); @@ -3393,7 +3565,7 @@ void ObjCARCContract::ContractRelease(Instruction *Release, !(AA->getModRefInfo(I, Loc) & AliasAnalysis::Mod))) ++I; StoreInst *Store = dyn_cast<StoreInst>(I); - if (!Store || Store->isVolatile()) return; + if (!Store || !Store->isSimple()) return; if (Store->getPointerOperand() != Loc.Ptr) return; Value *New = StripPointerCastsAndObjCCalls(Store->getValueOperand()); diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index e6341ae..8f98a5b 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -309,7 +309,7 @@ void Reassociate::LinearizeExprTree(BinaryOperator *I, std::swap(LHS, RHS); bool Success = !I->swapOperands(); assert(Success && "swapOperands failed"); - Success = false; + (void)Success; MadeChange = true; } else if (RHSBO) { // Turn (A+B)+(C+D) -> (((A+B)+C)+D). This guarantees the RHS is not diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 5b12c92..196a847 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -156,7 +156,7 @@ namespace { /// class SCCPSolver : public InstVisitor<SCCPSolver> { const TargetData *TD; - SmallPtrSet<BasicBlock*, 8> BBExecutable;// The BBs that are executable. + SmallPtrSet<BasicBlock*, 8> BBExecutable; // The BBs that are executable. DenseMap<Value*, LatticeVal> ValueState; // The state each value is in. 
/// StructValueState - This maintains ValueState for values that have @@ -471,9 +471,9 @@ private: /// UsersOfOverdefinedPHIs map for PN, remove them now. void RemoveFromOverdefinedPHIs(Instruction *I, PHINode *PN) { if (UsersOfOverdefinedPHIs.empty()) return; - std::multimap<PHINode*, Instruction*>::iterator It, E; - tie(It, E) = UsersOfOverdefinedPHIs.equal_range(PN); - while (It != E) { + typedef std::multimap<PHINode*, Instruction*>::iterator ItTy; + std::pair<ItTy, ItTy> Range = UsersOfOverdefinedPHIs.equal_range(PN); + for (ItTy It = Range.first, E = Range.second; It != E;) { if (It->second == I) UsersOfOverdefinedPHIs.erase(It++); else @@ -486,9 +486,9 @@ private: /// (Duplicate entries do not break anything directly, but can lead to /// exponential growth of the table in rare cases.) void InsertInOverdefinedPHIs(Instruction *I, PHINode *PN) { - std::multimap<PHINode*, Instruction*>::iterator J, E; - tie(J, E) = UsersOfOverdefinedPHIs.equal_range(PN); - for (; J != E; ++J) + typedef std::multimap<PHINode*, Instruction*>::iterator ItTy; + std::pair<ItTy, ItTy> Range = UsersOfOverdefinedPHIs.equal_range(PN); + for (ItTy J = Range.first, E = Range.second; J != E; ++J) if (J->second == I) return; UsersOfOverdefinedPHIs.insert(std::make_pair(PN, I)); @@ -515,6 +515,7 @@ private: void visitShuffleVectorInst(ShuffleVectorInst &I); void visitExtractValueInst(ExtractValueInst &EVI); void visitInsertValueInst(InsertValueInst &IVI); + void visitLandingPadInst(LandingPadInst &I) { markAnythingOverdefined(&I); } // Instructions that cannot be folded away. void visitStoreInst (StoreInst &I); @@ -528,8 +529,12 @@ private: visitTerminatorInst(II); } void visitCallSite (CallSite CS); + void visitResumeInst (TerminatorInst &I) { /*returns void*/ } void visitUnwindInst (TerminatorInst &I) { /*returns void*/ } void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ } + void visitFenceInst (FenceInst &I) { /*returns void*/ } + void visitAtomicCmpXchgInst (AtomicCmpXchgInst &I) { markOverdefined(&I); } + void visitAtomicRMWInst (AtomicRMWInst &I) { markOverdefined(&I); } void visitAllocaInst (Instruction &I) { markOverdefined(&I); } void visitVAArgInst (Instruction &I) { markAnythingOverdefined(&I); } @@ -577,6 +582,10 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, } if (SwitchInst *SI = dyn_cast<SwitchInst>(&TI)) { + if (TI.getNumSuccessors() < 2) { + Succs[0] = true; + return; + } LatticeVal SCValue = getValueState(SI->getCondition()); ConstantInt *CI = SCValue.getConstantInt(); @@ -637,6 +646,9 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { return true; if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { + if (SI->getNumSuccessors() < 2) + return true; + LatticeVal SCValue = getValueState(SI->getCondition()); ConstantInt *CI = SCValue.getConstantInt(); @@ -692,13 +704,14 @@ void SCCPSolver::visitPHINode(PHINode &PN) { // There may be instructions using this PHI node that are not overdefined // themselves. If so, make sure that they know that the PHI node operand // changed. 
- std::multimap<PHINode*, Instruction*>::iterator I, E; - tie(I, E) = UsersOfOverdefinedPHIs.equal_range(&PN); - if (I == E) + typedef std::multimap<PHINode*, Instruction*>::iterator ItTy; + std::pair<ItTy, ItTy> Range = UsersOfOverdefinedPHIs.equal_range(&PN); + + if (Range.first == Range.second) return; SmallVector<Instruction*, 16> Users; - for (; I != E; ++I) + for (ItTy I = Range.first, E = Range.second; I != E; ++I) Users.push_back(I->second); while (!Users.empty()) visit(Users.pop_back_val()); @@ -1179,8 +1192,8 @@ void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) { } Constant *Ptr = Operands[0]; - markConstant(&I, ConstantExpr::getGetElementPtr(Ptr, &Operands[0]+1, - Operands.size()-1)); + ArrayRef<Constant *> Indices(Operands.begin() + 1, Operands.end()); + markConstant(&I, ConstantExpr::getGetElementPtr(Ptr, Indices)); } void SCCPSolver::visitStoreInst(StoreInst &SI) { @@ -1420,66 +1433,115 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { if (I->getType()->isVoidTy()) continue; if (StructType *STy = dyn_cast<StructType>(I->getType())) { - // Only a few things that can be structs matter for undef. Just send - // all their results to overdefined. We could be more precise than this - // but it isn't worth bothering. - if (isa<CallInst>(I) || isa<SelectInst>(I)) { - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { - LatticeVal &LV = getStructValueState(I, i); - if (LV.isUndefined()) - markOverdefined(LV, I); - } + // Only a few things that can be structs matter for undef. + + // Tracked calls must never be marked overdefined in ResolvedUndefsIn. + if (CallSite CS = CallSite(I)) + if (Function *F = CS.getCalledFunction()) + if (MRVFunctionsTracked.count(F)) + continue; + + // extractvalue and insertvalue don't need to be marked; they are + // tracked as precisely as their operands. + if (isa<ExtractValueInst>(I) || isa<InsertValueInst>(I)) + continue; + + // Send the results of everything else to overdefined. We could be + // more precise than this but it isn't worth bothering. + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + LatticeVal &LV = getStructValueState(I, i); + if (LV.isUndefined()) + markOverdefined(LV, I); } continue; } - + LatticeVal &LV = getValueState(I); if (!LV.isUndefined()) continue; - // No instructions using structs need disambiguation. - if (I->getOperand(0)->getType()->isStructTy()) + // extractvalue is safe; check here because the argument is a struct. + if (isa<ExtractValueInst>(I)) continue; - // Get the lattice values of the first two operands for use below. + // Compute the operand LatticeVals, for convenience below. + // Anything taking a struct is conservatively assumed to require + // overdefined markings. + if (I->getOperand(0)->getType()->isStructTy()) { + markOverdefined(I); + return true; + } LatticeVal Op0LV = getValueState(I->getOperand(0)); LatticeVal Op1LV; if (I->getNumOperands() == 2) { - // No instructions using structs need disambiguation. - if (I->getOperand(1)->getType()->isStructTy()) - continue; - - // If this is a two-operand instruction, and if both operands are - // undefs, the result stays undef. + if (I->getOperand(1)->getType()->isStructTy()) { + markOverdefined(I); + return true; + } + Op1LV = getValueState(I->getOperand(1)); - if (Op0LV.isUndefined() && Op1LV.isUndefined()) - continue; } - // If this is an instructions whose result is defined even if the input is // not fully defined, propagate the information. 
Type *ITy = I->getType(); switch (I->getOpcode()) { - default: break; // Leave the instruction as an undef. + case Instruction::Add: + case Instruction::Sub: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: + break; // Any undef -> undef + case Instruction::FSub: + case Instruction::FAdd: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + // Floating-point binary operation: be conservative. + if (Op0LV.isUndefined() && Op1LV.isUndefined()) + markForcedConstant(I, Constant::getNullValue(ITy)); + else + markOverdefined(I); + return true; case Instruction::ZExt: - // After a zero extend, we know the top part is zero. SExt doesn't have - // to be handled here, because we don't know whether the top part is 1's - // or 0's. - case Instruction::SIToFP: // some FP values are not possible, just use 0. - case Instruction::UIToFP: // some FP values are not possible, just use 0. + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + // undef -> 0; some outputs are impossible markForcedConstant(I, Constant::getNullValue(ITy)); return true; case Instruction::Mul: case Instruction::And: + // Both operands undef -> undef + if (Op0LV.isUndefined() && Op1LV.isUndefined()) + break; // undef * X -> 0. X could be zero. // undef & X -> 0. X could be zero. markForcedConstant(I, Constant::getNullValue(ITy)); return true; case Instruction::Or: + // Both operands undef -> undef + if (Op0LV.isUndefined() && Op1LV.isUndefined()) + break; // undef | X -> -1. X could be -1. markForcedConstant(I, Constant::getAllOnesValue(ITy)); return true; + case Instruction::Xor: + // undef ^ undef -> 0; strictly speaking, this is not strictly + // necessary, but we try to be nice to people who expect this + // behavior in simple cases + if (Op0LV.isUndefined() && Op1LV.isUndefined()) { + markForcedConstant(I, Constant::getNullValue(ITy)); + return true; + } + // undef ^ X -> undef + break; + case Instruction::SDiv: case Instruction::UDiv: case Instruction::SRem: @@ -1494,26 +1556,24 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { return true; case Instruction::AShr: - // undef >>s X -> undef. No change. - if (Op0LV.isUndefined()) break; - - // X >>s undef -> X. X could be 0, X could have the high-bit known set. - if (Op0LV.isConstant()) - markForcedConstant(I, Op0LV.getConstant()); - else - markOverdefined(I); + // X >>a undef -> undef. + if (Op1LV.isUndefined()) break; + + // undef >>a X -> all ones + markForcedConstant(I, Constant::getAllOnesValue(ITy)); return true; case Instruction::LShr: case Instruction::Shl: - // undef >> X -> undef. No change. - // undef << X -> undef. No change. - if (Op0LV.isUndefined()) break; - - // X >> undef -> 0. X could be 0. - // X << undef -> 0. X could be 0. + // X << undef -> undef. + // X >> undef -> undef. + if (Op1LV.isUndefined()) break; + + // undef << X -> 0 + // undef >> X -> 0 markForcedConstant(I, Constant::getNullValue(ITy)); return true; case Instruction::Select: + Op1LV = getValueState(I->getOperand(1)); // undef ? X : Y -> X or Y. There could be commonality between X/Y. if (Op0LV.isUndefined()) { if (!Op1LV.isConstant()) // Pick the constant one if there is any. 
@@ -1533,9 +1593,35 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { else markOverdefined(I); return true; + case Instruction::Load: + // A load here means one of two things: a load of undef from a global, + // a load from an unknown pointer. Either way, having it return undef + // is okay. + break; + case Instruction::ICmp: + // X == undef -> undef. Other comparisons get more complicated. + if (cast<ICmpInst>(I)->isEquality()) + break; + markOverdefined(I); + return true; case Instruction::Call: - // If a call has an undef result, it is because it is constant foldable - // but one of the inputs was undef. Just force the result to + case Instruction::Invoke: { + // There are two reasons a call can have an undef result + // 1. It could be tracked. + // 2. It could be constant-foldable. + // Because of the way we solve return values, tracked calls must + // never be marked overdefined in ResolvedUndefsIn. + if (Function *F = CallSite(I).getCalledFunction()) + if (TrackedRetVals.count(F)) + break; + + // If the call is constant-foldable, we mark it overdefined because + // we do not know what return values are valid. + markOverdefined(I); + return true; + } + default: + // If we don't know what should happen here, conservatively mark it // overdefined. markOverdefined(I); return true; @@ -1621,15 +1707,25 @@ FunctionPass *llvm::createSCCPPass() { static void DeleteInstructionInBlock(BasicBlock *BB) { DEBUG(dbgs() << " BasicBlock Dead:" << *BB); ++NumDeadBlocks; - - // Delete the instructions backwards, as it has a reduced likelihood of - // having to update as many def-use and use-def chains. - while (!isa<TerminatorInst>(BB->begin())) { - Instruction *I = --BasicBlock::iterator(BB->getTerminator()); - - if (!I->use_empty()) - I->replaceAllUsesWith(UndefValue::get(I->getType())); - BB->getInstList().erase(I); + + // Check to see if there are non-terminating instructions to delete. + if (isa<TerminatorInst>(BB->begin())) + return; + + // Delete the instructions backwards, as it has a reduced likelihood of having + // to update as many def-use and use-def chains. + Instruction *EndInst = BB->getTerminator(); // Last not to be deleted. + while (EndInst != BB->begin()) { + // Delete the next to last instruction. 
+ BasicBlock::iterator I = EndInst; + Instruction *Inst = --I; + if (!Inst->use_empty()) + Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); + if (isa<LandingPadInst>(Inst)) { + EndInst = Inst; + continue; + } + BB->getInstList().erase(Inst); ++NumInstRemoved; } } diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 302c287..f6918de 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -63,7 +63,6 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeCFGSimplifyPassPass(Registry); initializeSimplifyLibCallsPass(Registry); initializeSinkingPass(Registry); - initializeTailDupPass(Registry); initializeTailCallElimPass(Registry); } @@ -187,3 +186,7 @@ void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) { void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createBasicAliasAnalysisPass()); } + +void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLowerExpectIntrinsicPass()); +} diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index fbf3092..c6d9123 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -145,6 +145,9 @@ namespace { SmallVector<AllocaInst*, 32> &NewElts); void RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, SmallVector<AllocaInst*, 32> &NewElts); + void RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, + uint64_t Offset, + SmallVector<AllocaInst*, 32> &NewElts); void RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, AllocaInst *AI, SmallVector<AllocaInst*, 32> &NewElts); @@ -295,8 +298,6 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { if (ScalarKind == Unknown) ScalarKind = Integer; - // FIXME: It should be possible to promote the vector type up to the alloca's - // size. if (ScalarKind == Vector && VectorTy->getBitWidth() != AllocaSize * 8) ScalarKind = Integer; @@ -331,16 +332,12 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { /// (VectorTy) so far at the offset specified by Offset (which is specified in /// bytes). /// -/// There are three cases we handle here: +/// There are two cases we handle here: /// 1) A union of vector types of the same size and potentially its elements. /// Here we turn element accesses into insert/extract element operations. /// This promotes a <4 x float> with a store of float to the third element /// into a <4 x float> that uses insert element. -/// 2) A union of vector types with power-of-2 size differences, e.g. a float, -/// <2 x float> and <4 x float>. Here we turn element accesses into insert -/// and extract element operations, and <2 x float> accesses into a cast to -/// <2 x double>, an extract, and a cast back to <2 x float>. -/// 3) A fully general blob of memory, which we turn into some (potentially +/// 2) A fully general blob of memory, which we turn into some (potentially /// large) integer type with extract and insert operations where the loads /// and stores would mutate the memory. We mark this by setting VectorTy /// to VoidTy. @@ -371,20 +368,13 @@ void ConvertToScalarInfo::MergeInTypeForLoadOrStore(Type *In, // if the implied vector agrees with what we already have and if Offset is // compatible with it. 
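The tightened condition at the end of the hunk above is pure byte arithmetic: a scalar access only fits the implied vector view of the alloca when it lands on an element boundary, the alloca tiles evenly by that element size, and the access width matches the element width already chosen. A standalone sketch under those assumptions (accessFitsImpliedVector is an invented name):

#include <cstdint>

// Offsets and sizes are in bytes; CurEltSize == 0 means no vector type has
// been implied for the alloca yet.
bool accessFitsImpliedVector(uint64_t Offset, uint64_t AllocaSize,
                             uint64_t EltSize, uint64_t CurEltSize) {
  if (Offset % EltSize != 0 || AllocaSize % EltSize != 0)
    return false;               // Not on an element boundary / not tileable.
  return CurEltSize == 0 ||     // First access: it defines the element size.
         EltSize == CurEltSize; // Later accesses must use the same width.
}

// e.g. a float store at byte 8 of a 16-byte alloca already viewed as
// <4 x float>: accessFitsImpliedVector(8, 16, 4, 4) == true, while an i16
// access at byte 2 (EltSize 2 vs. CurEltSize 4) is rejected.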
if (Offset % EltSize == 0 && AllocaSize % EltSize == 0 && - (!VectorTy || Offset * 8 < VectorTy->getPrimitiveSizeInBits())) { + (!VectorTy || EltSize == VectorTy->getElementType() + ->getPrimitiveSizeInBits()/8)) { if (!VectorTy) { ScalarKind = ImplicitVector; VectorTy = VectorType::get(In, AllocaSize/EltSize); - return; } - - unsigned CurrentEltSize = VectorTy->getElementType() - ->getPrimitiveSizeInBits()/8; - if (EltSize == CurrentEltSize) - return; - - if (In->isIntegerTy() && isPowerOf2_32(AllocaSize / EltSize)) - return; + return; } } @@ -397,72 +387,19 @@ void ConvertToScalarInfo::MergeInTypeForLoadOrStore(Type *In, /// returning true if the type was successfully merged and false otherwise. bool ConvertToScalarInfo::MergeInVectorType(VectorType *VInTy, uint64_t Offset) { - // TODO: Support nonzero offsets? - if (Offset != 0) - return false; - - // Only allow vectors that are a power-of-2 away from the size of the alloca. - if (!isPowerOf2_64(AllocaSize / (VInTy->getBitWidth() / 8))) - return false; - - // If this the first vector we see, remember the type so that we know the - // element size. - if (!VectorTy) { + if (VInTy->getBitWidth()/8 == AllocaSize && Offset == 0) { + // If we're storing/loading a vector of the right size, allow it as a + // vector. If this the first vector we see, remember the type so that + // we know the element size. If this is a subsequent access, ignore it + // even if it is a differing type but the same size. Worst case we can + // bitcast the resultant vectors. + if (!VectorTy) + VectorTy = VInTy; ScalarKind = Vector; - VectorTy = VInTy; return true; } - unsigned BitWidth = VectorTy->getBitWidth(); - unsigned InBitWidth = VInTy->getBitWidth(); - - // Vectors of the same size can be converted using a simple bitcast. - if (InBitWidth == BitWidth && AllocaSize == (InBitWidth / 8)) { - ScalarKind = Vector; - return true; - } - - Type *ElementTy = VectorTy->getElementType(); - Type *InElementTy = VInTy->getElementType(); - - // Do not allow mixed integer and floating-point accesses from vectors of - // different sizes. - if (ElementTy->isFloatingPointTy() != InElementTy->isFloatingPointTy()) - return false; - - if (ElementTy->isFloatingPointTy()) { - // Only allow floating-point vectors of different sizes if they have the - // same element type. - // TODO: This could be loosened a bit, but would anything benefit? - if (ElementTy != InElementTy) - return false; - - // There are no arbitrary-precision floating-point types, which limits the - // number of legal vector types with larger element types that we can form - // to bitcast and extract a subvector. - // TODO: We could support some more cases with mixed fp128 and double here. - if (!(BitWidth == 64 || BitWidth == 128) || - !(InBitWidth == 64 || InBitWidth == 128)) - return false; - } else { - assert(ElementTy->isIntegerTy() && "Vector elements must be either integer " - "or floating-point."); - unsigned BitWidth = ElementTy->getPrimitiveSizeInBits(); - unsigned InBitWidth = InElementTy->getPrimitiveSizeInBits(); - - // Do not allow integer types smaller than a byte or types whose widths are - // not a multiple of a byte. - if (BitWidth < 8 || InBitWidth < 8 || - BitWidth % 8 != 0 || InBitWidth % 8 != 0) - return false; - } - - // Pick the largest of the two vector types. - ScalarKind = Vector; - if (InBitWidth > BitWidth) - VectorTy = VInTy; - - return true; + return false; } /// CanConvertToScalar - V is a pointer. 
If we can convert the pointee and all @@ -480,7 +417,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { if (LoadInst *LI = dyn_cast<LoadInst>(User)) { // Don't break volatile loads. - if (LI->isVolatile()) + if (!LI->isSimple()) return false; // Don't touch MMX operations. if (LI->getType()->isX86_MMXTy()) @@ -492,7 +429,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { if (StoreInst *SI = dyn_cast<StoreInst>(User)) { // Storing the pointer, not into the value? - if (SI->getOperand(0) == V || SI->isVolatile()) return false; + if (SI->getOperand(0) == V || !SI->isSimple()) return false; // Don't touch MMX operations. if (SI->getOperand(0)->getType()->isX86_MMXTy()) return false; @@ -502,7 +439,8 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { } if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) { - IsNotTrivial = true; // Can't be mem2reg'd. + if (!onlyUsedByLifetimeMarkers(BCI)) + IsNotTrivial = true; // Can't be mem2reg'd. if (!CanConvertToScalar(BCI, Offset)) return false; continue; @@ -560,6 +498,14 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { continue; } + // If this is a lifetime intrinsic, we can handle it. + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) { + continue; + } + } + // Otherwise, we cannot handle this! return false; } @@ -599,7 +545,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, if (LoadInst *LI = dyn_cast<LoadInst>(User)) { // The load is a bit extract from NewAI shifted right by Offset bits. - Value *LoadedVal = Builder.CreateLoad(NewAI, "tmp"); + Value *LoadedVal = Builder.CreateLoad(NewAI); Value *NewLoadVal = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, Builder); LI->replaceAllUsesWith(NewLoadVal); @@ -703,65 +649,18 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, continue; } - llvm_unreachable("Unsupported operation!"); - } -} + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) { + // There's no need to preserve these, as the resulting alloca will be + // converted to a register anyways. + II->eraseFromParent(); + continue; + } + } -/// getScaledElementType - Gets a scaled element type for a partial vector -/// access of an alloca. The input types must be integer or floating-point -/// scalar or vector types, and the resulting type is an integer, float or -/// double. -static Type *getScaledElementType(Type *Ty1, Type *Ty2, - unsigned NewBitWidth) { - bool IsFP1 = Ty1->isFloatingPointTy() || - (Ty1->isVectorTy() && - cast<VectorType>(Ty1)->getElementType()->isFloatingPointTy()); - bool IsFP2 = Ty2->isFloatingPointTy() || - (Ty2->isVectorTy() && - cast<VectorType>(Ty2)->getElementType()->isFloatingPointTy()); - - LLVMContext &Context = Ty1->getContext(); - - // Prefer floating-point types over integer types, as integer types may have - // been created by earlier scalar replacement. 
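Many hunks in this file replace isVolatile() guards with !isSimple(). A simple access is one that is neither volatile nor atomic, so the new guards also reject atomic loads and stores; a rough standalone model of the predicate (MemAccess is an invented stand-in for LoadInst/StoreInst):

struct MemAccess { bool Volatile; bool Atomic; };

static bool isSimple(const MemAccess &A) { return !A.Volatile && !A.Atomic; }

// The old guard "if (LI->isVolatile()) return false;" let atomic accesses
// through; the new "if (!LI->isSimple()) return false;" rejects both.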
- if (IsFP1 || IsFP2) { - if (NewBitWidth == 32) - return Type::getFloatTy(Context); - if (NewBitWidth == 64) - return Type::getDoubleTy(Context); + llvm_unreachable("Unsupported operation!"); } - - return Type::getIntNTy(Context, NewBitWidth); -} - -/// CreateShuffleVectorCast - Creates a shuffle vector to convert one vector -/// to another vector of the same element type which has the same allocation -/// size but different primitive sizes (e.g. <3 x i32> and <4 x i32>). -static Value *CreateShuffleVectorCast(Value *FromVal, Type *ToType, - IRBuilder<> &Builder) { - Type *FromType = FromVal->getType(); - VectorType *FromVTy = cast<VectorType>(FromType); - VectorType *ToVTy = cast<VectorType>(ToType); - assert((ToVTy->getElementType() == FromVTy->getElementType()) && - "Vectors must have the same element type"); - Value *UnV = UndefValue::get(FromType); - unsigned numEltsFrom = FromVTy->getNumElements(); - unsigned numEltsTo = ToVTy->getNumElements(); - - SmallVector<Constant*, 3> Args; - Type* Int32Ty = Builder.getInt32Ty(); - unsigned minNumElts = std::min(numEltsFrom, numEltsTo); - unsigned i; - for (i=0; i != minNumElts; ++i) - Args.push_back(ConstantInt::get(Int32Ty, i)); - - if (i < numEltsTo) { - Constant* UnC = UndefValue::get(Int32Ty); - for (; i != numEltsTo; ++i) - Args.push_back(UnC); - } - Constant *Mask = ConstantVector::get(Args); - return Builder.CreateShuffleVector(FromVal, UnV, Mask, "tmpV"); } /// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer @@ -787,38 +686,8 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, if (VectorType *VTy = dyn_cast<VectorType>(FromType)) { unsigned FromTypeSize = TD.getTypeAllocSize(FromType); unsigned ToTypeSize = TD.getTypeAllocSize(ToType); - if (FromTypeSize == ToTypeSize) { - // If the two types have the same primitive size, use a bit cast. - // Otherwise, it is two vectors with the same element type that has - // the same allocation size but different number of elements so use - // a shuffle vector. - if (FromType->getPrimitiveSizeInBits() == - ToType->getPrimitiveSizeInBits()) - return Builder.CreateBitCast(FromVal, ToType, "tmp"); - else - return CreateShuffleVectorCast(FromVal, ToType, Builder); - } - - if (isPowerOf2_64(FromTypeSize / ToTypeSize)) { - assert(!(ToType->isVectorTy() && Offset != 0) && "Can't extract a value " - "of a smaller vector type at a nonzero offset."); - - Type *CastElementTy = getScaledElementType(FromType, ToType, - ToTypeSize * 8); - unsigned NumCastVectorElements = FromTypeSize / ToTypeSize; - - LLVMContext &Context = FromVal->getContext(); - Type *CastTy = VectorType::get(CastElementTy, - NumCastVectorElements); - Value *Cast = Builder.CreateBitCast(FromVal, CastTy, "tmp"); - - unsigned EltSize = TD.getTypeAllocSizeInBits(CastElementTy); - unsigned Elt = Offset/EltSize; - assert(EltSize*Elt == Offset && "Invalid modulus in validity checking"); - Value *Extract = Builder.CreateExtractElement(Cast, ConstantInt::get( - Type::getInt32Ty(Context), Elt), "tmp"); - return Builder.CreateBitCast(Extract, ToType, "tmp"); - } + if (FromTypeSize == ToTypeSize) + return Builder.CreateBitCast(FromVal, ToType); // Otherwise it must be an element access. unsigned Elt = 0; @@ -828,10 +697,9 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, assert(EltSize*Elt == Offset && "Invalid modulus in validity checking"); } // Return the element extracted out of it. 
- Value *V = Builder.CreateExtractElement(FromVal, ConstantInt::get( - Type::getInt32Ty(FromVal->getContext()), Elt), "tmp"); + Value *V = Builder.CreateExtractElement(FromVal, Builder.getInt32(Elt)); if (V->getType() != ToType) - V = Builder.CreateBitCast(V, ToType, "tmp"); + V = Builder.CreateBitCast(V, ToType); return V; } @@ -844,7 +712,7 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i), Offset+Layout.getElementOffsetInBits(i), Builder); - Res = Builder.CreateInsertValue(Res, Elt, i, "tmp"); + Res = Builder.CreateInsertValue(Res, Elt, i); } return Res; } @@ -855,7 +723,7 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(), Offset+i*EltSize, Builder); - Res = Builder.CreateInsertValue(Res, Elt, i, "tmp"); + Res = Builder.CreateInsertValue(Res, Elt, i); } return Res; } @@ -881,33 +749,31 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, // only some bits are used. if (ShAmt > 0 && (unsigned)ShAmt < NTy->getBitWidth()) FromVal = Builder.CreateLShr(FromVal, - ConstantInt::get(FromVal->getType(), - ShAmt), "tmp"); + ConstantInt::get(FromVal->getType(), ShAmt)); else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth()) FromVal = Builder.CreateShl(FromVal, - ConstantInt::get(FromVal->getType(), - -ShAmt), "tmp"); + ConstantInt::get(FromVal->getType(), -ShAmt)); // Finally, unconditionally truncate the integer to the right width. unsigned LIBitWidth = TD.getTypeSizeInBits(ToType); if (LIBitWidth < NTy->getBitWidth()) FromVal = Builder.CreateTrunc(FromVal, IntegerType::get(FromVal->getContext(), - LIBitWidth), "tmp"); + LIBitWidth)); else if (LIBitWidth > NTy->getBitWidth()) FromVal = Builder.CreateZExt(FromVal, IntegerType::get(FromVal->getContext(), - LIBitWidth), "tmp"); + LIBitWidth)); // If the result is an integer, this is a trunc or bitcast. if (ToType->isIntegerTy()) { // Should be done. } else if (ToType->isFloatingPointTy() || ToType->isVectorTy()) { // Just do a bitcast, we know the sizes match up. - FromVal = Builder.CreateBitCast(FromVal, ToType, "tmp"); + FromVal = Builder.CreateBitCast(FromVal, ToType); } else { // Otherwise must be a pointer. - FromVal = Builder.CreateIntToPtr(FromVal, ToType, "tmp"); + FromVal = Builder.CreateIntToPtr(FromVal, ToType); } assert(FromVal->getType() == ToType && "Didn't convert right?"); return FromVal; @@ -936,56 +802,21 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, // Changing the whole vector with memset or with an access of a different // vector type? - if (ValSize == VecSize) { - // If the two types have the same primitive size, use a bit cast. - // Otherwise, it is two vectors with the same element type that has - // the same allocation size but different number of elements so use - // a shuffle vector. 
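The extract/insert paths that remain reduce to size arithmetic: a matching allocation size means one bitcast, otherwise the bit offset selects a single element. A standalone sketch (classifyVectorAccess is an invented name; sizes are in bits to match the getTypeAllocSizeInBits calls):

#include <cassert>
#include <cstdint>

enum class VecAccessKind { WholeVectorBitcast, SingleElement };

// Hypothetical model: how an access of AccessBits at BitOffset maps onto a
// vector-typed alloca of VecBits whose elements are EltBits wide.
VecAccessKind classifyVectorAccess(uint64_t VecBits, uint64_t EltBits,
                                   uint64_t AccessBits, uint64_t BitOffset,
                                   unsigned &EltIndex) {
  if (AccessBits == VecBits) {
    EltIndex = 0;                      // Same size: a single bitcast suffices.
    return VecAccessKind::WholeVectorBitcast;
  }
  assert(BitOffset % EltBits == 0 && "access must land on an element boundary");
  EltIndex = static_cast<unsigned>(BitOffset / EltBits);
  return VecAccessKind::SingleElement;
}

// e.g. a float load at bit offset 64 from a <4 x float> alloca is element 2.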
- if (VTy->getPrimitiveSizeInBits() == - SV->getType()->getPrimitiveSizeInBits()) - return Builder.CreateBitCast(SV, AllocaType, "tmp"); - else - return CreateShuffleVectorCast(SV, VTy, Builder); - } - - if (isPowerOf2_64(VecSize / ValSize)) { - assert(!(SV->getType()->isVectorTy() && Offset != 0) && "Can't insert a " - "value of a smaller vector type at a nonzero offset."); - - Type *CastElementTy = getScaledElementType(VTy, SV->getType(), - ValSize); - unsigned NumCastVectorElements = VecSize / ValSize; - - LLVMContext &Context = SV->getContext(); - Type *OldCastTy = VectorType::get(CastElementTy, - NumCastVectorElements); - Value *OldCast = Builder.CreateBitCast(Old, OldCastTy, "tmp"); - - Value *SVCast = Builder.CreateBitCast(SV, CastElementTy, "tmp"); - - unsigned EltSize = TD.getTypeAllocSizeInBits(CastElementTy); - unsigned Elt = Offset/EltSize; - assert(EltSize*Elt == Offset && "Invalid modulus in validity checking"); - Value *Insert = - Builder.CreateInsertElement(OldCast, SVCast, ConstantInt::get( - Type::getInt32Ty(Context), Elt), "tmp"); - return Builder.CreateBitCast(Insert, AllocaType, "tmp"); - } + if (ValSize == VecSize) + return Builder.CreateBitCast(SV, AllocaType); // Must be an element insertion. assert(SV->getType() == VTy->getElementType()); uint64_t EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType()); unsigned Elt = Offset/EltSize; - return Builder.CreateInsertElement(Old, SV, - ConstantInt::get(Type::getInt32Ty(SV->getContext()), Elt), - "tmp"); + return Builder.CreateInsertElement(Old, SV, Builder.getInt32(Elt)); } // If SV is a first-class aggregate value, insert each value recursively. if (StructType *ST = dyn_cast<StructType>(SV->getType())) { const StructLayout &Layout = *TD.getStructLayout(ST); for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { - Value *Elt = Builder.CreateExtractValue(SV, i, "tmp"); + Value *Elt = Builder.CreateExtractValue(SV, i); Old = ConvertScalar_InsertValue(Elt, Old, Offset+Layout.getElementOffsetInBits(i), Builder); @@ -996,7 +827,7 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, if (ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) { uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType()); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { - Value *Elt = Builder.CreateExtractValue(SV, i, "tmp"); + Value *Elt = Builder.CreateExtractValue(SV, i); Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, Builder); } return Old; @@ -1009,20 +840,19 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, unsigned SrcStoreWidth = TD.getTypeStoreSizeInBits(SV->getType()); unsigned DestStoreWidth = TD.getTypeStoreSizeInBits(AllocaType); if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy()) - SV = Builder.CreateBitCast(SV, - IntegerType::get(SV->getContext(),SrcWidth), "tmp"); + SV = Builder.CreateBitCast(SV, IntegerType::get(SV->getContext(),SrcWidth)); else if (SV->getType()->isPointerTy()) - SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext()), "tmp"); + SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext())); // Zero extend or truncate the value if needed. if (SV->getType() != AllocaType) { if (SV->getType()->getPrimitiveSizeInBits() < AllocaType->getPrimitiveSizeInBits()) - SV = Builder.CreateZExt(SV, AllocaType, "tmp"); + SV = Builder.CreateZExt(SV, AllocaType); else { // Truncation may be needed if storing more than the alloca can hold // (undefined behavior). 
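The integer path this hunk continues with is a plain bitfield insert: widen the value, shift it to its offset, and merge it into the old contents under a mask. A standalone model of that arithmetic, assuming the usual mask-and-or merge that follows the shifts shown here (insertBits is an invented name):

#include <cstdint>

// 64-bit model of inserting SrcWidth bits of Val into Old at bit position
// Shift (the pass does the same with APInt and IRBuilder).
uint64_t insertBits(uint64_t Old, uint64_t Val, unsigned SrcWidth,
                    unsigned Shift) {
  uint64_t Mask = (SrcWidth >= 64 ? ~0ULL : ((1ULL << SrcWidth) - 1)) << Shift;
  return (Old & ~Mask) | ((Val << Shift) & Mask);
}

// insertBits(0xFFFFFFFF'FFFFFFFF, 0xAB, 8, 16) == 0xFFFFFFFF'FFABFFFF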
- SV = Builder.CreateTrunc(SV, AllocaType, "tmp"); + SV = Builder.CreateTrunc(SV, AllocaType); SrcWidth = DestWidth; SrcStoreWidth = DestStoreWidth; } @@ -1045,12 +875,10 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, // only some bits in the structure are set. APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth)); if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) { - SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(), - ShAmt), "tmp"); + SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(), ShAmt)); Mask <<= ShAmt; } else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) { - SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(), - -ShAmt), "tmp"); + SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(), -ShAmt)); Mask = Mask.lshr(-ShAmt); } @@ -1196,7 +1024,7 @@ static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) { for (Value::use_iterator UI = SI->use_begin(), UE = SI->use_end(); UI != UE; ++UI) { LoadInst *LI = dyn_cast<LoadInst>(*UI); - if (LI == 0 || LI->isVolatile()) return false; + if (LI == 0 || !LI->isSimple()) return false; // Both operands to the select need to be dereferencable, either absolutely // (e.g. allocas) or at this point because we can see other accesses to it. @@ -1237,7 +1065,7 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) { for (Value::use_iterator UI = PN->use_begin(), UE = PN->use_end(); UI != UE; ++UI) { LoadInst *LI = dyn_cast<LoadInst>(*UI); - if (LI == 0 || LI->isVolatile()) return false; + if (LI == 0 || !LI->isSimple()) return false; // For now we only allow loads in the same block as the PHI. This is a // common case that happens when instcombine merges two loads through a PHI. @@ -1258,17 +1086,21 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) { // trapping load in the predecessor if it is a critical edge. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { BasicBlock *Pred = PN->getIncomingBlock(i); + Value *InVal = PN->getIncomingValue(i); + + // If the terminator of the predecessor has side-effects (an invoke), + // there is no safe place to put a load in the predecessor. + if (Pred->getTerminator()->mayHaveSideEffects()) + return false; + + // If the value is produced by the terminator of the predecessor + // (an invoke), there is no valid place to put a load in the predecessor. + if (Pred->getTerminator() == InVal) + return false; // If the predecessor has a single successor, then the edge isn't critical. if (Pred->getTerminator()->getNumSuccessors() == 1) continue; - - Value *InVal = PN->getIncomingValue(i); - - // If the InVal is an invoke in the pred, we can't put a load on the edge. - if (InvokeInst *II = dyn_cast<InvokeInst>(InVal)) - if (II->getParent() == Pred) - return false; // If this pointer is always safe to load, or if we can prove that there is // already a load in the block, then we can move the load to the pred block. @@ -1295,13 +1127,13 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { UI != UE; ++UI) { User *U = *UI; if (LoadInst *LI = dyn_cast<LoadInst>(U)) { - if (LI->isVolatile()) + if (!LI->isSimple()) return false; continue; } if (StoreInst *SI = dyn_cast<StoreInst>(U)) { - if (SI->getOperand(0) == AI || SI->isVolatile()) + if (SI->getOperand(0) == AI || !SI->isSimple()) return false; // Don't allow a store OF the AI, only INTO the AI. 
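The "store OF the AI, only INTO the AI" rule above, shown at the source level (illustrative C++ only):

int *Global;

void intoOnly() {
  int X;       // promotable: the address of X never escapes
  X = 42;      // store INTO the alloca
}

void escapes() {
  int X;
  Global = &X; // store OF the alloca's address -- promotion must bail
}                // (intentionally leaves a dangling pointer; illustration only)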
continue; } @@ -1343,6 +1175,13 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { continue; } + if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) { + if (onlyUsedByLifetimeMarkers(BCI)) { + InstsToRewrite.insert(BCI); + continue; + } + } + return false; } @@ -1354,6 +1193,18 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { // If we have instructions that need to be rewritten for this to be promotable // take care of it now. for (unsigned i = 0, e = InstsToRewrite.size(); i != e; ++i) { + if (BitCastInst *BCI = dyn_cast<BitCastInst>(InstsToRewrite[i])) { + // This could only be a bitcast used by nothing but lifetime intrinsics. + for (BitCastInst::use_iterator I = BCI->use_begin(), E = BCI->use_end(); + I != E;) { + Use &U = I.getUse(); + ++I; + cast<Instruction>(U.getUser())->eraseFromParent(); + } + BCI->eraseFromParent(); + continue; + } + if (SelectInst *SI = dyn_cast<SelectInst>(InstsToRewrite[i])) { // Selects in InstsToRewrite only have load uses. Rewrite each as two // loads with a new select. @@ -1670,7 +1521,7 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, UI.getOperandNo() == 0, Info, MI, true /*AllowWholeAccess*/); } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) { - if (LI->isVolatile()) + if (!LI->isSimple()) return MarkUnsafe(Info, User); Type *LIType = LI->getType(); isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType), @@ -1679,13 +1530,17 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) { // Store is ok if storing INTO the pointer, not storing the pointer - if (SI->isVolatile() || SI->getOperand(0) == I) + if (!SI->isSimple() || SI->getOperand(0) == I) return MarkUnsafe(Info, User); Type *SIType = SI->getOperand(0)->getType(); isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType), SIType, true, Info, SI, true /*AllowWholeAccess*/); Info.hasALoadOrStore = true; + } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { + if (II->getIntrinsicID() != Intrinsic::lifetime_start && + II->getIntrinsicID() != Intrinsic::lifetime_end) + return MarkUnsafe(Info, User); } else if (isa<PHINode>(User) || isa<SelectInst>(User)) { isSafePHISelectUseForScalarRepl(User, Offset, Info); } else { @@ -1725,7 +1580,7 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, return MarkUnsafe(Info, User); isSafePHISelectUseForScalarRepl(GEPI, Offset, Info); } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) { - if (LI->isVolatile()) + if (!LI->isSimple()) return MarkUnsafe(Info, User); Type *LIType = LI->getType(); isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType), @@ -1734,7 +1589,7 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) { // Store is ok if storing INTO the pointer, not storing the pointer - if (SI->isVolatile() || SI->getOperand(0) == I) + if (!SI->isSimple() || SI->getOperand(0) == I) return MarkUnsafe(Info, User); Type *SIType = SI->getOperand(0)->getType(); @@ -1923,6 +1778,14 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, // address operand will be updated, so nothing else needs to be done. 
continue; } + + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) { + RewriteLifetimeIntrinsic(II, AI, Offset, NewElts); + } + continue; + } if (LoadInst *LI = dyn_cast<LoadInst>(User)) { Type *LIType = LI->getType(); @@ -2080,8 +1943,7 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, } Instruction *Val = NewElts[Idx]; if (NewArgs.size() > 1) { - Val = GetElementPtrInst::CreateInBounds(Val, NewArgs.begin(), - NewArgs.end(), "", GEPI); + Val = GetElementPtrInst::CreateInBounds(Val, NewArgs, "", GEPI); Val->takeName(GEPI); } if (Val->getType() != GEPI->getType()) @@ -2090,6 +1952,62 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, DeadInsts.push_back(GEPI); } +/// RewriteLifetimeIntrinsic - II is a lifetime.start/lifetime.end. Rewrite it +/// to mark the lifetime of the scalarized memory. +void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, + uint64_t Offset, + SmallVector<AllocaInst*, 32> &NewElts) { + ConstantInt *OldSize = cast<ConstantInt>(II->getArgOperand(0)); + // Put matching lifetime markers on everything from Offset up to + // Offset+OldSize. + Type *AIType = AI->getAllocatedType(); + uint64_t NewOffset = Offset; + Type *IdxTy; + uint64_t Idx = FindElementAndOffset(AIType, NewOffset, IdxTy); + + IRBuilder<> Builder(II); + uint64_t Size = OldSize->getLimitedValue(); + + if (NewOffset) { + // Splice the first element and index 'NewOffset' bytes in. SROA will + // split the alloca again later. + Value *V = Builder.CreateBitCast(NewElts[Idx], Builder.getInt8PtrTy()); + V = Builder.CreateGEP(V, Builder.getInt64(NewOffset)); + + IdxTy = NewElts[Idx]->getAllocatedType(); + uint64_t EltSize = TD->getTypeAllocSize(IdxTy) - NewOffset; + if (EltSize > Size) { + EltSize = Size; + Size = 0; + } else { + Size -= EltSize; + } + if (II->getIntrinsicID() == Intrinsic::lifetime_start) + Builder.CreateLifetimeStart(V, Builder.getInt64(EltSize)); + else + Builder.CreateLifetimeEnd(V, Builder.getInt64(EltSize)); + ++Idx; + } + + for (; Idx != NewElts.size() && Size; ++Idx) { + IdxTy = NewElts[Idx]->getAllocatedType(); + uint64_t EltSize = TD->getTypeAllocSize(IdxTy); + if (EltSize > Size) { + EltSize = Size; + Size = 0; + } else { + Size -= EltSize; + } + if (II->getIntrinsicID() == Intrinsic::lifetime_start) + Builder.CreateLifetimeStart(NewElts[Idx], + Builder.getInt64(EltSize)); + else + Builder.CreateLifetimeEnd(NewElts[Idx], + Builder.getInt64(EltSize)); + } + DeadInsts.push_back(II); +} + /// RewriteMemIntrinUserOfAlloca - MI is a memcpy/memset/memmove from or to AI. /// Rewrite it to copy or set the elements of the scalarized memory. void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, @@ -2157,7 +2075,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, if (OtherPtr) { Value *Idx[2] = { Zero, ConstantInt::get(Type::getInt32Ty(MI->getContext()), i) }; - OtherElt = GetElementPtrInst::CreateInBounds(OtherPtr, Idx, Idx + 2, + OtherElt = GetElementPtrInst::CreateInBounds(OtherPtr, Idx, OtherPtr->getName()+"."+Twine(i), MI); uint64_t EltOffset; @@ -2226,8 +2144,8 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, assert(StoreVal->getType() == ValTy && "Type mismatch!"); // If the requested value was a vector constant, create it. 
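RewriteLifetimeIntrinsic above walks the element allocas and gives each one a marker covering however many of the remaining bytes it holds. The size bookkeeping is sketched standalone below (splitLifetime is an invented name; sizes are in bytes, as with getTypeAllocSize):

#include <cstdint>
#include <utility>
#include <vector>

// Given the byte sizes of the scalarized elements and a lifetime marker of
// Size bytes starting at element FirstElt (after any leading partial element
// has been peeled off), return how many bytes each per-element marker covers.
std::vector<std::pair<unsigned, uint64_t>>
splitLifetime(const std::vector<uint64_t> &EltSizes, unsigned FirstElt,
              uint64_t Size) {
  std::vector<std::pair<unsigned, uint64_t>> Markers; // (element, byte count)
  for (unsigned Idx = FirstElt; Idx != EltSizes.size() && Size; ++Idx) {
    uint64_t EltSize = EltSizes[Idx];
    if (EltSize > Size) { EltSize = Size; Size = 0; } // trailing partial element
    else                { Size -= EltSize; }
    Markers.emplace_back(Idx, EltSize);
  }
  return Markers;
}

// e.g. elements {4, 8, 4} and a 10-byte marker starting at element 0 yield
// markers of 4 and 6 bytes, and no marker at all for the last element.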
- if (EltTy != ValTy) { - unsigned NumElts = cast<VectorType>(ValTy)->getNumElements(); + if (EltTy->isVectorTy()) { + unsigned NumElts = cast<VectorType>(EltTy)->getNumElements(); SmallVector<Constant*, 16> Elts(NumElts, StoreVal); StoreVal = ConstantVector::get(Elts); } @@ -2574,7 +2492,7 @@ isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, if (LoadInst *LI = dyn_cast<LoadInst>(U)) { // Ignore non-volatile loads, they are always ok. - if (LI->isVolatile()) return false; + if (!LI->isSimple()) return false; continue; } diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index ad52417..fbb9465 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -338,16 +338,17 @@ struct StrCmpOpt : public LibCallOptimization { bool HasStr1 = GetConstantStringInfo(Str1P, Str1); bool HasStr2 = GetConstantStringInfo(Str2P, Str2); - if (HasStr1 && Str1.empty()) // strcmp("", x) -> *x - return B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), CI->getType()); - - if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x - return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); - // strcmp(x, y) -> cnst (if both x and y are constant strings) if (HasStr1 && HasStr2) return ConstantInt::get(CI->getType(), - strcmp(Str1.c_str(),Str2.c_str())); + StringRef(Str1).compare(Str2)); + + if (HasStr1 && Str1.empty()) // strcmp("", x) -> -*x + return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), + CI->getType())); + + if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x + return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); // strcmp(P, "x") -> memcmp(P, "x", 2) uint64_t Len1 = GetStringLength(Str1P); @@ -400,16 +401,20 @@ struct StrNCmpOpt : public LibCallOptimization { bool HasStr1 = GetConstantStringInfo(Str1P, Str1); bool HasStr2 = GetConstantStringInfo(Str2P, Str2); - if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> *x - return B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), CI->getType()); + // strncmp(x, y) -> cnst (if both x and y are constant strings) + if (HasStr1 && HasStr2) { + StringRef SubStr1 = StringRef(Str1).substr(0, Length); + StringRef SubStr2 = StringRef(Str2).substr(0, Length); + return ConstantInt::get(CI->getType(), SubStr1.compare(SubStr2)); + } + + if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> -*x + return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), + CI->getType())); if (HasStr2 && Str2.empty()) // strncmp(x, "", n) -> *x return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); - // strncmp(x, y) -> cnst (if both x and y are constant strings) - if (HasStr1 && HasStr2) - return ConstantInt::get(CI->getType(), - strncmp(Str1.c_str(), Str2.c_str(), Length)); return 0; } }; @@ -874,8 +879,8 @@ struct PowOpt : public LibCallOptimization { Callee->getAttributes()); Value *FAbs = EmitUnaryFloatFnCall(Sqrt, "fabs", B, Callee->getAttributes()); - Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf, "tmp"); - Value *Sel = B.CreateSelect(FCmp, Inf, FAbs, "tmp"); + Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf); + Value *Sel = B.CreateSelect(FCmp, Inf, FAbs); return Sel; } @@ -908,10 +913,10 @@ struct Exp2Opt : public LibCallOptimization { Value *LdExpArg = 0; if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) { if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32) - LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty(), "tmp"); + LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty()); } else if (UIToFPInst 
*OpC = dyn_cast<UIToFPInst>(Op)) { if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32) - LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty(), "tmp"); + LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty()); } if (LdExpArg) { @@ -996,10 +1001,10 @@ struct FFSOpt : public LibCallOptimization { Value *F = Intrinsic::getDeclaration(Callee->getParent(), Intrinsic::cttz, ArgType); Value *V = B.CreateCall(F, Op, "cttz"); - V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1), "tmp"); - V = B.CreateIntCast(V, B.getInt32Ty(), false, "tmp"); + V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1)); + V = B.CreateIntCast(V, B.getInt32Ty(), false); - Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType), "tmp"); + Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType)); return B.CreateSelect(Cond, V, B.getInt32(0)); } }; diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp index 705f442..c83f56c 100644 --- a/lib/Transforms/Scalar/Sink.cpp +++ b/lib/Transforms/Scalar/Sink.cpp @@ -153,9 +153,13 @@ bool Sinking::ProcessBlock(BasicBlock &BB) { static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA, SmallPtrSet<Instruction *, 8> &Stores) { - if (LoadInst *L = dyn_cast<LoadInst>(Inst)) { - if (L->isVolatile()) return false; + if (Inst->mayWriteToMemory()) { + Stores.insert(Inst); + return false; + } + + if (LoadInst *L = dyn_cast<LoadInst>(Inst)) { AliasAnalysis::Location Loc = AA->getLocation(L); for (SmallPtrSet<Instruction *, 8>::iterator I = Stores.begin(), E = Stores.end(); I != E; ++I) @@ -163,11 +167,6 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA, return false; } - if (Inst->mayWriteToMemory()) { - Stores.insert(Inst); - return false; - } - if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst)) return false; diff --git a/lib/Transforms/Scalar/TailDuplication.cpp b/lib/Transforms/Scalar/TailDuplication.cpp deleted file mode 100644 index 9dd83c0..0000000 --- a/lib/Transforms/Scalar/TailDuplication.cpp +++ /dev/null @@ -1,373 +0,0 @@ -//===- TailDuplication.cpp - Simplify CFG through tail duplication --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass performs a limited form of tail duplication, intended to simplify -// CFGs by removing some unconditional branches. This pass is necessary to -// straighten out loops created by the C front-end, but also is capable of -// making other code nicer. After this pass is run, the CFG simplify pass -// should be run to clean up the mess. -// -// This pass could be enhanced in the future to use profile information to be -// more aggressive. 
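For context on the pass being deleted here, the transformation its header describes looks roughly like this at the source level (hand-written C++ illustration, not output of the pass):

// Before: two paths reach the small shared tail T via unconditional jumps.
int before(int a, int b, bool p) {
  int c;
  if (p) { c = a + 1; goto T; }
  c = b + 1;
T:
  return c * 2;  // the shared tail
}

// After tail duplication: the tail is cloned into each predecessor, so the
// unconditional branches disappear at the cost of duplicating T's code.
int after(int a, int b, bool p) {
  if (p) { int c = a + 1; return c * 2; }
  int c = b + 1;
  return c * 2;
}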
-// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "tailduplicate" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Constant.h" -#include "llvm/Function.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Pass.h" -#include "llvm/Type.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Support/CFG.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/Local.h" -#include <map> -using namespace llvm; - -STATISTIC(NumEliminated, "Number of unconditional branches eliminated"); - -static cl::opt<unsigned> -TailDupThreshold("taildup-threshold", - cl::desc("Max block size to tail duplicate"), - cl::init(1), cl::Hidden); - -namespace { - class TailDup : public FunctionPass { - bool runOnFunction(Function &F); - public: - static char ID; // Pass identification, replacement for typeid - TailDup() : FunctionPass(ID) { - initializeTailDupPass(*PassRegistry::getPassRegistry()); - } - - private: - inline bool shouldEliminateUnconditionalBranch(TerminatorInst *, unsigned); - inline void eliminateUnconditionalBranch(BranchInst *BI); - SmallPtrSet<BasicBlock*, 4> CycleDetector; - }; -} - -char TailDup::ID = 0; -INITIALIZE_PASS(TailDup, "tailduplicate", "Tail Duplication", false, false) - -// Public interface to the Tail Duplication pass -FunctionPass *llvm::createTailDuplicationPass() { return new TailDup(); } - -/// runOnFunction - Top level algorithm - Loop over each unconditional branch in -/// the function, eliminating it if it looks attractive enough. CycleDetector -/// prevents infinite loops by checking that we aren't redirecting a branch to -/// a place it already pointed to earlier; see PR 2323. -bool TailDup::runOnFunction(Function &F) { - bool Changed = false; - CycleDetector.clear(); - for (Function::iterator I = F.begin(), E = F.end(); I != E; ) { - if (shouldEliminateUnconditionalBranch(I->getTerminator(), - TailDupThreshold)) { - eliminateUnconditionalBranch(cast<BranchInst>(I->getTerminator())); - Changed = true; - } else { - ++I; - CycleDetector.clear(); - } - } - return Changed; -} - -/// shouldEliminateUnconditionalBranch - Return true if this branch looks -/// attractive to eliminate. We eliminate the branch if the destination basic -/// block has <= 5 instructions in it, not counting PHI nodes. In practice, -/// since one of these is a terminator instruction, this means that we will add -/// up to 4 instructions to the new block. -/// -/// We don't count PHI nodes in the count since they will be removed when the -/// contents of the block are copied over. -/// -bool TailDup::shouldEliminateUnconditionalBranch(TerminatorInst *TI, - unsigned Threshold) { - BranchInst *BI = dyn_cast<BranchInst>(TI); - if (!BI || !BI->isUnconditional()) return false; // Not an uncond branch! - - BasicBlock *Dest = BI->getSuccessor(0); - if (Dest == BI->getParent()) return false; // Do not loop infinitely! - - // Do not inline a block if we will just get another branch to the same block! - TerminatorInst *DTI = Dest->getTerminator(); - if (BranchInst *DBI = dyn_cast<BranchInst>(DTI)) - if (DBI->isUnconditional() && DBI->getSuccessor(0) == Dest) - return false; // Do not loop infinitely! - - // FIXME: DemoteRegToStack cannot yet demote invoke instructions to the stack, - // because doing so would require breaking critical edges. 
This should be - // fixed eventually. - if (!DTI->use_empty()) - return false; - - // Do not bother with blocks with only a single predecessor: simplify - // CFG will fold these two blocks together! - pred_iterator PI = pred_begin(Dest), PE = pred_end(Dest); - ++PI; - if (PI == PE) return false; // Exactly one predecessor! - - BasicBlock::iterator I = Dest->getFirstNonPHI(); - - for (unsigned Size = 0; I != Dest->end(); ++I) { - if (Size == Threshold) return false; // The block is too large. - - // Don't tail duplicate call instructions. They are very large compared to - // other instructions. - if (isa<CallInst>(I) || isa<InvokeInst>(I)) return false; - - // Also alloca and malloc. - if (isa<AllocaInst>(I)) return false; - - // Some vector instructions can expand into a number of instructions. - if (isa<ShuffleVectorInst>(I) || isa<ExtractElementInst>(I) || - isa<InsertElementInst>(I)) return false; - - // Only count instructions that are not debugger intrinsics. - if (!isa<DbgInfoIntrinsic>(I)) ++Size; - } - - // Do not tail duplicate a block that has thousands of successors into a block - // with a single successor if the block has many other predecessors. This can - // cause an N^2 explosion in CFG edges (and PHI node entries), as seen in - // cases that have a large number of indirect gotos. - unsigned NumSuccs = DTI->getNumSuccessors(); - if (NumSuccs > 8) { - unsigned TooMany = 128; - if (NumSuccs >= TooMany) return false; - TooMany = TooMany/NumSuccs; - for (; PI != PE; ++PI) - if (TooMany-- == 0) return false; - } - - // If this unconditional branch is a fall-through, be careful about - // tail duplicating it. In particular, we don't want to taildup it if the - // original block will still be there after taildup is completed: doing so - // would eliminate the fall-through, requiring unconditional branches. - Function::iterator DestI = Dest; - if (&*--DestI == BI->getParent()) { - // The uncond branch is a fall-through. Tail duplication of the block is - // will eliminate the fall-through-ness and end up cloning the terminator - // at the end of the Dest block. Since the original Dest block will - // continue to exist, this means that one or the other will not be able to - // fall through. One typical example that this helps with is code like: - // if (a) - // foo(); - // if (b) - // foo(); - // Cloning the 'if b' block into the end of the first foo block is messy. - - // The messy case is when the fall-through block falls through to other - // blocks. This is what we would be preventing if we cloned the block. - DestI = Dest; - if (++DestI != Dest->getParent()->end()) { - BasicBlock *DestSucc = DestI; - // If any of Dest's successors are fall-throughs, don't do this xform. - for (succ_iterator SI = succ_begin(Dest), SE = succ_end(Dest); - SI != SE; ++SI) - if (*SI == DestSucc) - return false; - } - } - - // Finally, check that we haven't redirected to this target block earlier; - // there are cases where we loop forever if we don't check this (PR 2323). - if (!CycleDetector.insert(Dest)) - return false; - - return true; -} - -/// FindObviousSharedDomOf - We know there is a branch from SrcBlock to -/// DestBlock, and that SrcBlock is not the only predecessor of DstBlock. If we -/// can find a predecessor of SrcBlock that is a dominator of both SrcBlock and -/// DstBlock, return it. -static BasicBlock *FindObviousSharedDomOf(BasicBlock *SrcBlock, - BasicBlock *DstBlock) { - // SrcBlock must have a single predecessor. 
- pred_iterator PI = pred_begin(SrcBlock), PE = pred_end(SrcBlock); - if (PI == PE || ++PI != PE) return 0; - - BasicBlock *SrcPred = *pred_begin(SrcBlock); - - // Look at the predecessors of DstBlock. One of them will be SrcBlock. If - // there is only one other pred, get it, otherwise we can't handle it. - PI = pred_begin(DstBlock); PE = pred_end(DstBlock); - BasicBlock *DstOtherPred = 0; - BasicBlock *P = *PI; - if (P == SrcBlock) { - if (++PI == PE) return 0; - DstOtherPred = *PI; - if (++PI != PE) return 0; - } else { - DstOtherPred = P; - if (++PI == PE || *PI != SrcBlock || ++PI != PE) return 0; - } - - // We can handle two situations here: "if then" and "if then else" blocks. An - // 'if then' situation is just where DstOtherPred == SrcPred. - if (DstOtherPred == SrcPred) - return SrcPred; - - // Check to see if we have an "if then else" situation, which means that - // DstOtherPred will have a single predecessor and it will be SrcPred. - PI = pred_begin(DstOtherPred); PE = pred_end(DstOtherPred); - if (PI != PE && *PI == SrcPred) { - if (++PI != PE) return 0; // Not a single pred. - return SrcPred; // Otherwise, it's an "if then" situation. Return the if. - } - - // Otherwise, this is something we can't handle. - return 0; -} - - -/// eliminateUnconditionalBranch - Clone the instructions from the destination -/// block into the source block, eliminating the specified unconditional branch. -/// If the destination block defines values used by successors of the dest -/// block, we may need to insert PHI nodes. -/// -void TailDup::eliminateUnconditionalBranch(BranchInst *Branch) { - BasicBlock *SourceBlock = Branch->getParent(); - BasicBlock *DestBlock = Branch->getSuccessor(0); - assert(SourceBlock != DestBlock && "Our predicate is broken!"); - - DEBUG(dbgs() << "TailDuplication[" << SourceBlock->getParent()->getName() - << "]: Eliminating branch: " << *Branch); - - // See if we can avoid duplicating code by moving it up to a dominator of both - // blocks. - if (BasicBlock *DomBlock = FindObviousSharedDomOf(SourceBlock, DestBlock)) { - DEBUG(dbgs() << "Found shared dominator: " << DomBlock->getName() << "\n"); - - // If there are non-phi instructions in DestBlock that have no operands - // defined in DestBlock, and if the instruction has no side effects, we can - // move the instruction to DomBlock instead of duplicating it. - BasicBlock::iterator BBI = DestBlock->getFirstNonPHI(); - while (!isa<TerminatorInst>(BBI)) { - Instruction *I = BBI++; - - bool CanHoist = I->isSafeToSpeculativelyExecute() && - !I->mayReadFromMemory(); - if (CanHoist) { - for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) - if (Instruction *OpI = dyn_cast<Instruction>(I->getOperand(op))) - if (OpI->getParent() == DestBlock || - (isa<InvokeInst>(OpI) && OpI->getParent() == DomBlock)) { - CanHoist = false; - break; - } - if (CanHoist) { - // Remove from DestBlock, move right before the term in DomBlock. - DestBlock->getInstList().remove(I); - DomBlock->getInstList().insert(DomBlock->getTerminator(), I); - DEBUG(dbgs() << "Hoisted: " << *I); - } - } - } - } - - // Tail duplication can not update SSA properties correctly if the values - // defined in the duplicated tail are used outside of the tail itself. For - // this reason, we spill all values that are used outside of the tail to the - // stack. - for (BasicBlock::iterator I = DestBlock->begin(); I != DestBlock->end(); ++I) - if (I->isUsedOutsideOfBlock(DestBlock)) { - // We found a use outside of the tail. 
Create a new stack slot to - // break this inter-block usage pattern. - DemoteRegToStack(*I); - } - - // We are going to have to map operands from the original block B to the new - // copy of the block B'. If there are PHI nodes in the DestBlock, these PHI - // nodes also define part of this mapping. Loop over these PHI nodes, adding - // them to our mapping. - // - std::map<Value*, Value*> ValueMapping; - - BasicBlock::iterator BI = DestBlock->begin(); - bool HadPHINodes = isa<PHINode>(BI); - for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) - ValueMapping[PN] = PN->getIncomingValueForBlock(SourceBlock); - - // Clone the non-phi instructions of the dest block into the source block, - // keeping track of the mapping... - // - for (; BI != DestBlock->end(); ++BI) { - Instruction *New = BI->clone(); - New->setName(BI->getName()); - SourceBlock->getInstList().push_back(New); - ValueMapping[BI] = New; - } - - // Now that we have built the mapping information and cloned all of the - // instructions (giving us a new terminator, among other things), walk the new - // instructions, rewriting references of old instructions to use new - // instructions. - // - BI = Branch; ++BI; // Get an iterator to the first new instruction - for (; BI != SourceBlock->end(); ++BI) - for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) { - std::map<Value*, Value*>::const_iterator I = - ValueMapping.find(BI->getOperand(i)); - if (I != ValueMapping.end()) - BI->setOperand(i, I->second); - } - - // Next we check to see if any of the successors of DestBlock had PHI nodes. - // If so, we need to add entries to the PHI nodes for SourceBlock now. - for (succ_iterator SI = succ_begin(DestBlock), SE = succ_end(DestBlock); - SI != SE; ++SI) { - BasicBlock *Succ = *SI; - for (BasicBlock::iterator PNI = Succ->begin(); isa<PHINode>(PNI); ++PNI) { - PHINode *PN = cast<PHINode>(PNI); - // Ok, we have a PHI node. Figure out what the incoming value was for the - // DestBlock. - Value *IV = PN->getIncomingValueForBlock(DestBlock); - - // Remap the value if necessary... - std::map<Value*, Value*>::const_iterator I = ValueMapping.find(IV); - if (I != ValueMapping.end()) - IV = I->second; - PN->addIncoming(IV, SourceBlock); - } - } - - // Next, remove the old branch instruction, and any PHI node entries that we - // had. - BI = Branch; ++BI; // Get an iterator to the first new instruction - DestBlock->removePredecessor(SourceBlock); // Remove entries in PHI nodes... - SourceBlock->getInstList().erase(Branch); // Destroy the uncond branch... - - // Final step: now that we have finished everything up, walk the cloned - // instructions one last time, constant propagating and DCE'ing them, because - // they may not be needed anymore. - // - if (HadPHINodes) { - while (BI != SourceBlock->end()) { - Instruction *Inst = BI++; - if (isInstructionTriviallyDead(Inst)) - Inst->eraseFromParent(); - else if (Value *V = SimplifyInstruction(Inst)) { - Inst->replaceAllUsesWith(V); - Inst->eraseFromParent(); - } - } - } - - ++NumEliminated; // We just killed a branch! -} |
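Returning to the SimplifyLibCalls hunks above: the corrected fold turns strcmp("", x) into the negated, zero-extended first byte of x, which preserves the sign contract of the C function. A standalone check (foldedStrcmpEmptyLhs is an invented name):

#include <cassert>
#include <cstring>

int foldedStrcmpEmptyLhs(const char *X) {
  // strcmp("", x)  ->  -(zero-extended first byte of x)
  return -static_cast<int>(static_cast<unsigned char>(X[0]));
}

int main() {
  const char *Samples[] = {"", "a", "zzz"};
  for (const char *S : Samples) {
    int Ref = std::strcmp("", S);
    int Folded = foldedStrcmpEmptyLhs(S);
    // Only the sign of strcmp's result is specified; the fold preserves it.
    assert((Ref == 0) == (Folded == 0) && (Ref < 0) == (Folded < 0));
  }
  return 0;
}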