15 files changed, 387 insertions, 191 deletions
diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp
index a8deda8..5912107 100644
--- a/lib/Transforms/Scalar/CodeGenPrepare.cpp
+++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp
@@ -43,6 +43,7 @@
 #include "llvm/Transforms/Utils/AddrModeMatcher.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include "llvm/Transforms/Utils/BypassSlowDivision.h"
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 using namespace llvm::PatternMatch;
@@ -148,7 +149,19 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
   PFI = getAnalysisIfAvailable<ProfileInfo>();
   OptSize = F.hasFnAttr(Attribute::OptimizeForSize);
 
-  // First pass, eliminate blocks that contain only PHI nodes and an
+  /// This optimization identifies DIV instructions that can be
+  /// profitably bypassed and carried out with a shorter, faster divide.
+  if (TLI && TLI->isSlowDivBypassed()) {
+    const DenseMap<Type *, Type *> &BypassTypeMap = TLI->getBypassSlowDivTypes();
+
+    for (Function::iterator I = F.begin(); I != F.end(); I++) {
+      EverMadeChange |= bypassSlowDivision(F,
+                                           I,
+                                           BypassTypeMap);
+    }
+  }
+
+  // Eliminate blocks that contain only PHI nodes and an
   // unconditional branch.
   EverMadeChange |= EliminateMostlyEmptyBlocks(F);
 
@@ -988,7 +1001,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
     WeakVH IterHandle(CurInstIterator);
     BasicBlock *BB = CurInstIterator->getParent();
 
-    RecursivelyDeleteTriviallyDeadInstructions(Repl);
+    RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo);
 
     if (IterHandle != CurInstIterator) {
       // If the iterator instruction was recursively deleted, start over at the
@@ -1174,17 +1187,32 @@ static bool isFormingBranchFromSelectProfitable(SelectInst *SI) {
 }
 
 
+/// If we have a SelectInst that will likely profit from branch prediction,
+/// turn it into a branch.
 bool CodeGenPrepare::OptimizeSelectInst(SelectInst *SI) {
-  // If we have a SelectInst that will likely profit from branch prediction,
-  // turn it into a branch.
-  if (DisableSelectToBranch || OptSize || !TLI ||
-      !TLI->isPredictableSelectExpensive())
-    return false;
+  bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
 
-  if (!SI->getCondition()->getType()->isIntegerTy(1) ||
-      !isFormingBranchFromSelectProfitable(SI))
+  // Can we convert the 'select' to CF ?
+  if (DisableSelectToBranch || OptSize || !TLI || VectorCond)
     return false;
 
+  TargetLowering::SelectSupportKind SelectKind;
+  if (VectorCond)
+    SelectKind = TargetLowering::VectorMaskSelect;
+  else if (SI->getType()->isVectorTy())
+    SelectKind = TargetLowering::ScalarCondVectorVal;
+  else
+    SelectKind = TargetLowering::ScalarValSelect;
+
+  // Do we have efficient codegen support for this kind of 'selects' ?
+  if (TLI->isSelectSupported(SelectKind)) {
+    // We have efficient codegen support for the select instruction.
+    // Check if it is profitable to keep this 'select'.
+    if (!TLI->isPredictableSelectExpensive() ||
+        !isFormingBranchFromSelectProfitable(SI))
+      return false;
+  }
+
   ModifiedDT = true;
 
   // First, we split the block containing the select into 2 blocks.
diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp
index 8dbcc23..086f0a1 100644
--- a/lib/Transforms/Scalar/DCE.cpp
+++ b/lib/Transforms/Scalar/DCE.cpp
@@ -22,6 +22,7 @@
 #include "llvm/Instruction.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/InstIterator.h"
+#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/ADT/Statistic.h"
 using namespace llvm;
 
@@ -38,10 +39,11 @@ namespace {
       initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry());
     }
     virtual bool runOnBasicBlock(BasicBlock &BB) {
+      TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
       bool Changed = false;
       for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
         Instruction *Inst = DI++;
-        if (isInstructionTriviallyDead(Inst)) {
+        if (isInstructionTriviallyDead(Inst, TLI)) {
           Inst->eraseFromParent();
           Changed = true;
           ++DIEEliminated;
@@ -87,6 +89,8 @@ char DCE::ID = 0;
 INITIALIZE_PASS(DCE, "dce", "Dead Code Elimination", false, false)
 
 bool DCE::runOnFunction(Function &F) {
+  TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+
   // Start out with all of the instructions in the worklist...
   std::vector<Instruction*> WorkList;
   for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i)
@@ -101,7 +105,7 @@ bool DCE::runOnFunction(Function &F) {
     Instruction *I = WorkList.back();
     WorkList.pop_back();
 
-    if (isInstructionTriviallyDead(I)) {       // If the instruction is dead.
+    if (isInstructionTriviallyDead(I, TLI)) { // If the instruction is dead.
       // Loop over all of the values that the instruction uses, if there are
       // instructions being used, add them to the worklist, because they might
       // go dead after this one is removed.
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 8b1283f..1ff4329 100644
--- a/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -106,6 +106,7 @@ FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); }
 ///
 static void DeleteDeadInstruction(Instruction *I,
                                   MemoryDependenceAnalysis &MD,
+                                  const TargetLibraryInfo *TLI,
                                   SmallSetVector<Value*, 16> *ValueSet = 0) {
   SmallVector<Instruction*, 32> NowDeadInsts;
 
@@ -130,7 +131,7 @@ static void DeleteDeadInstruction(Instruction *I,
       if (!Op->use_empty()) continue;
 
       if (Instruction *OpI = dyn_cast<Instruction>(Op))
-        if (isInstructionTriviallyDead(OpI))
+        if (isInstructionTriviallyDead(OpI, TLI))
           NowDeadInsts.push_back(OpI);
     }
 
@@ -276,7 +277,7 @@ static Value *getStoredPointerOperand(Instruction *I) {
 
 static uint64_t getPointerSize(const Value *V, AliasAnalysis &AA) {
   uint64_t Size;
-  if (getObjectSize(V, Size, AA.getTargetData()))
+  if (getObjectSize(V, Size, AA.getTargetData(), AA.getTargetLibraryInfo()))
     return Size;
   return AliasAnalysis::UnknownSize;
 }
@@ -454,7 +455,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
     Instruction *Inst = BBI++;
 
     // Handle 'free' calls specially.
-    if (CallInst *F = isFreeCall(Inst)) {
+    if (CallInst *F = isFreeCall(Inst, AA->getTargetLibraryInfo())) {
       MadeChange |= HandleFree(F);
       continue;
     }
@@ -483,7 +484,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
           // in case we need it.
           WeakVH NextInst(BBI);
 
-          DeleteDeadInstruction(SI, *MD);
+          DeleteDeadInstruction(SI, *MD, AA->getTargetLibraryInfo());
 
           if (NextInst == 0)  // Next instruction deleted.
             BBI = BB.begin();
@@ -530,7 +531,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
                 << *DepWrite << "\n  KILLER: " << *Inst << '\n');
 
           // Delete the store and now-dead instructions that feed it.
-          DeleteDeadInstruction(DepWrite, *MD);
+          DeleteDeadInstruction(DepWrite, *MD, AA->getTargetLibraryInfo());
           ++NumFastStores;
           MadeChange = true;
 
@@ -640,7 +641,7 @@ bool DSE::HandleFree(CallInst *F) {
       Instruction *Next = llvm::next(BasicBlock::iterator(Dependency));
 
       // DCE instructions only used to calculate that store
-      DeleteDeadInstruction(Dependency, *MD);
+      DeleteDeadInstruction(Dependency, *MD, AA->getTargetLibraryInfo());
       ++NumFastStores;
       MadeChange = true;
 
@@ -680,7 +681,8 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
 
     // Okay, so these are dead heap objects, but if the pointer never escapes
     // then it's leaked by this function anyways.
-    else if (isAllocLikeFn(I) && !PointerMayBeCaptured(I, true, true))
+    else if (isAllocLikeFn(I, AA->getTargetLibraryInfo()) &&
+             !PointerMayBeCaptured(I, true, true))
       DeadStackObjects.insert(I);
   }
 
@@ -724,7 +726,8 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
               dbgs() << '\n');
 
         // DCE instructions only used to calculate that store.
-        DeleteDeadInstruction(Dead, *MD, &DeadStackObjects);
+        DeleteDeadInstruction(Dead, *MD, AA->getTargetLibraryInfo(),
+                              &DeadStackObjects);
         ++NumFastStores;
         MadeChange = true;
         continue;
@@ -732,9 +735,10 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
     }
 
     // Remove any dead non-memory-mutating instructions.
-    if (isInstructionTriviallyDead(BBI)) {
+    if (isInstructionTriviallyDead(BBI, AA->getTargetLibraryInfo())) {
       Instruction *Inst = BBI++;
-      DeleteDeadInstruction(Inst, *MD, &DeadStackObjects);
+      DeleteDeadInstruction(Inst, *MD, AA->getTargetLibraryInfo(),
+                            &DeadStackObjects);
       ++NumFastOther;
       MadeChange = true;
       continue;
@@ -750,7 +754,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
     if (CallSite CS = cast<Value>(BBI)) {
       // Remove allocation function calls from the list of dead stack objects; 
       // there can't be any references before the definition.
-      if (isAllocLikeFn(BBI))
+      if (isAllocLikeFn(BBI, AA->getTargetLibraryInfo()))
         DeadStackObjects.remove(BBI);
 
       // If this call does not access memory, it can't be loading any of our
@@ -771,15 +775,15 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
           LiveAllocas.push_back(*I);
       }
 
-      for (SmallVector<Value*, 8>::iterator I = LiveAllocas.begin(),
-           E = LiveAllocas.end(); I != E; ++I)
-        DeadStackObjects.remove(*I);
-
       // If all of the allocas were clobbered by the call then we're not going
       // to find anything else to process.
-      if (DeadStackObjects.empty())
+      if (DeadStackObjects.size() == LiveAllocas.size())
         break;
 
+      for (SmallVector<Value*, 8>::iterator I = LiveAllocas.begin(),
+           E = LiveAllocas.end(); I != E; ++I)
+        DeadStackObjects.remove(*I);
+
       continue;
     }
 
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index 9759549..2627113 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -374,7 +374,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
     Instruction *Inst = I++;
 
     // Dead instructions should just be removed.
-    if (isInstructionTriviallyDead(Inst)) {
+    if (isInstructionTriviallyDead(Inst, TLI)) {
       DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n');
       Inst->eraseFromParent();
       Changed = true;
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 4822fd0..16ae6ad 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -271,16 +271,16 @@ void ValueTable::add(Value *V, uint32_t num) {
   valueNumbering.insert(std::make_pair(V, num));
 }
 
-uint32_t ValueTable::lookup_or_add_call(CallInst* C) {
+uint32_t ValueTable::lookup_or_add_call(CallInst *C) {
   if (AA->doesNotAccessMemory(C)) {
     Expression exp = create_expression(C);
-    uint32_t& e = expressionNumbering[exp];
+    uint32_t &e = expressionNumbering[exp];
     if (!e) e = nextValueNumber++;
     valueNumbering[C] = e;
     return e;
   } else if (AA->onlyReadsMemory(C)) {
     Expression exp = create_expression(C);
-    uint32_t& e = expressionNumbering[exp];
+    uint32_t &e = expressionNumbering[exp];
     if (!e) {
       e = nextValueNumber++;
       valueNumbering[C] = e;
@@ -413,7 +413,7 @@ uint32_t ValueTable::lookup_or_add(Value *V) {
     case Instruction::LShr:
     case Instruction::AShr:
     case Instruction::And:
-    case Instruction::Or :
+    case Instruction::Or:
     case Instruction::Xor:
     case Instruction::ICmp:
     case Instruction::FCmp:
@@ -632,6 +632,7 @@ INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false)
 
+#ifndef NDEBUG
 void GVN::dump(DenseMap<uint32_t, Value*>& d) {
   errs() << "{\n";
   for (DenseMap<uint32_t, Value*>::iterator I = d.begin(),
@@ -641,6 +642,7 @@ void GVN::dump(DenseMap<uint32_t, Value*>& d) {
   }
   errs() << "}\n";
 }
+#endif
 
 /// IsValueFullyAvailableInBlock - Return true if we can prove that the value
 /// we're analyzing is fully available in the specified block.  As we go, keep
@@ -1436,7 +1438,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
     Instruction *DepInst = DepInfo.getInst();
 
     // Loading the allocation -> undef.
-    if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst) ||
+    if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) ||
         // Loading immediately after lifetime begin -> undef.
         isLifetimeStart(DepInst)) {
       ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
@@ -1951,7 +1953,7 @@ bool GVN::processLoad(LoadInst *L) {
   // If this load really doesn't depend on anything, then we must be loading an
   // undef value.  This can happen when loading for a fresh allocation with no
   // intervening stores, for example.
-  if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst)) {
+  if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI)) {
     L->replaceAllUsesWith(UndefValue::get(L->getType()));
     markInstructionForDeletion(L);
     ++NumGVNLoad;
@@ -2231,12 +2233,20 @@ bool GVN::processInstruction(Instruction *I) {
     Value *SwitchCond = SI->getCondition();
     BasicBlock *Parent = SI->getParent();
     bool Changed = false;
+
+    // Remember how many outgoing edges there are to every successor.
+    SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
+    for (unsigned i = 0, n = SI->getNumSuccessors(); i != n; ++i)
+      ++SwitchEdges[SI->getSuccessor(i)];
+
     for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
          i != e; ++i) {
       BasicBlock *Dst = i.getCaseSuccessor();
-      BasicBlockEdge E(Parent, Dst);
-      if (E.isSingleEdge())
+      // If there is only a single edge, propagate the case value into it.
+      if (SwitchEdges.lookup(Dst) == 1) {
+        BasicBlockEdge E(Parent, Dst);
         Changed |= propagateEquality(SwitchCond, i.getCaseValue(), E);
+      }
     }
     return Changed;
   }
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 37f8bdf..c933a17 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -44,6 +44,7 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/SimplifyIndVar.h"
 #include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
@@ -68,6 +69,7 @@ namespace {
     ScalarEvolution *SE;
     DominatorTree   *DT;
     TargetData      *TD;
+    TargetLibraryInfo *TLI;
 
     SmallVector<WeakVH, 16> DeadInsts;
     bool Changed;
@@ -414,11 +416,11 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) {
   // new comparison.
   NewCompare->takeName(Compare);
   Compare->replaceAllUsesWith(NewCompare);
-  RecursivelyDeleteTriviallyDeadInstructions(Compare);
+  RecursivelyDeleteTriviallyDeadInstructions(Compare, TLI);
 
   // Delete the old floating point increment.
   Incr->replaceAllUsesWith(UndefValue::get(Incr->getType()));
-  RecursivelyDeleteTriviallyDeadInstructions(Incr);
+  RecursivelyDeleteTriviallyDeadInstructions(Incr, TLI);
 
   // If the FP induction variable still has uses, this is because something else
   // in the loop uses its value.  In order to canonicalize the induction
@@ -431,7 +433,7 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) {
     Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv",
                                  PN->getParent()->getFirstInsertionPt());
     PN->replaceAllUsesWith(Conv);
-    RecursivelyDeleteTriviallyDeadInstructions(PN);
+    RecursivelyDeleteTriviallyDeadInstructions(PN, TLI);
   }
   Changed = true;
 }
@@ -550,14 +552,14 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
         PN->setIncomingValue(i, ExitVal);
 
         // If this instruction is dead now, delete it.
-        RecursivelyDeleteTriviallyDeadInstructions(Inst);
+        RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI);
 
         if (NumPreds == 1) {
           // Completely replace a single-pred PHI. This is safe, because the
           // NewVal won't be variant in the loop, so we don't need an LCSSA phi
           // node anymore.
           PN->replaceAllUsesWith(ExitVal);
-          RecursivelyDeleteTriviallyDeadInstructions(PN);
+          RecursivelyDeleteTriviallyDeadInstructions(PN, TLI);
         }
       }
       if (NumPreds != 1) {
@@ -1697,6 +1699,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
   SE = &getAnalysis<ScalarEvolution>();
   DT = &getAnalysis<DominatorTree>();
   TD = getAnalysisIfAvailable<TargetData>();
+  TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
 
   DeadInsts.clear();
   Changed = false;
@@ -1763,7 +1766,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
   while (!DeadInsts.empty())
     if (Instruction *Inst =
           dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val()))
-      RecursivelyDeleteTriviallyDeadInstructions(Inst);
+      RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI);
 
   // The Rewriter may not be used from this point on.
 
@@ -1772,7 +1775,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
   SinkUnusedInvariants(L);
 
   // Clean up dead instructions.
-  Changed |= DeleteDeadPHIs(L->getHeader());
+  Changed |= DeleteDeadPHIs(L->getHeader(), TLI);
   // Check a post-condition.
   assert(L->isLCSSAForm(*DT) &&
          "Indvars did not leave the loop in lcssa form!");
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index dd42c59..20844c6 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -1455,7 +1455,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,
   // At this point, the IR is fully up to date and consistent.  Do a quick scan
   // over the new instructions and zap any that are constants or dead.  This
   // frequently happens because of phi translation.
-  SimplifyInstructionsInBlock(NewBB, TD);
+  SimplifyInstructionsInBlock(NewBB, TD, TLI);
 
   // Threaded an edge!
   ++NumThreads;
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 0192e92..99bedce 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -108,6 +108,9 @@ namespace {
     BasicBlock *Preheader;   // The preheader block of the current loop...
     Loop *CurLoop;           // The current loop we are working on...
     AliasSetTracker *CurAST; // AliasSet information for the current loop...
+    bool MayThrow;           // The current loop contains an instruction which
+                             // may throw, thus preventing code motion of
+                             // instructions with side effects.
     DenseMap<Loop*, AliasSetTracker*> LoopToAliasSetMap;
 
     /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info.
@@ -240,6 +243,15 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
       CurAST->add(*BB);                 // Incorporate the specified basic block
   }
 
+  MayThrow = false;
+  // TODO: We've already searched for instructions which may throw in subloops.
+  // We may want to reuse this information.
+  for (Loop::block_iterator BB = L->block_begin(), BBE = L->block_end();
+       (BB != BBE) && !MayThrow ; ++BB)
+    for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end();
+         (I != E) && !MayThrow; ++I)
+      MayThrow |= I->mayThrow();
+
   // We want to visit all of the instructions in this loop... that are not parts
   // of our subloops (they have already had their invariants hoisted out of
   // their loop, into this loop, so there is no need to process the BODIES of
@@ -307,7 +319,7 @@ void LICM::SinkRegion(DomTreeNode *N) {
 
     // If the instruction is dead, we would try to sink it because it isn't used
     // in the loop, instead, just delete it.
-    if (isInstructionTriviallyDead(&I)) {
+    if (isInstructionTriviallyDead(&I, TLI)) {
       DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n');
       ++II;
       CurAST->deleteValue(&I);
@@ -418,17 +430,22 @@ bool LICM::canSinkOrHoistInst(Instruction &I) {
       if (!FoundMod) return true;
     }
 
-    // FIXME: This should use mod/ref information to see if we can hoist or sink
-    // the call.
+    // FIXME: This should use mod/ref information to see if we can hoist or
+    // sink the call.
 
     return false;
   }
 
-  // Otherwise these instructions are hoistable/sinkable
-  return isa<BinaryOperator>(I) || isa<CastInst>(I) ||
-         isa<SelectInst>(I) || isa<GetElementPtrInst>(I) || isa<CmpInst>(I) ||
-         isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
-         isa<ShuffleVectorInst>(I);
+  // Only these instructions are hoistable/sinkable.
+  bool HoistableKind = (isa<BinaryOperator>(I) || isa<CastInst>(I) ||
+                            isa<SelectInst>(I) || isa<GetElementPtrInst>(I) ||
+                            isa<CmpInst>(I)    || isa<InsertElementInst>(I) ||
+                            isa<ExtractElementInst>(I) ||
+                            isa<ShuffleVectorInst>(I));
+  if (!HoistableKind)
+      return false;
+
+  return isSafeToExecuteUnconditionally(I);
 }
 
 /// isNotUsedInLoop - Return true if the only users of this instruction are
@@ -604,6 +621,12 @@ bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) {
 }
 
 bool LICM::isGuaranteedToExecute(Instruction &Inst) {
+
+  // Somewhere in this loop there is an instruction which may throw and make us
+  // exit the loop.
+  if (MayThrow)
+    return false;
+
   // Otherwise we have to check to make sure that the instruction dominates all
   // of the exit blocks.  If it doesn't, then there is a path out of the loop
   // which does not execute this instruction, so we can't hoist it.
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index ac1082c..a72e288 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -132,7 +132,8 @@ Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognize(); }
 /// and zero out all the operands of this instruction.  If any of them become
 /// dead, delete them and the computation tree that feeds them.
 ///
-static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE) {
+static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE,
+                                  const TargetLibraryInfo *TLI) {
   SmallVector<Instruction*, 32> NowDeadInsts;
 
   NowDeadInsts.push_back(I);
@@ -153,7 +154,7 @@ static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE) {
       if (!Op->use_empty()) continue;
 
       if (Instruction *OpI = dyn_cast<Instruction>(Op))
-        if (isInstructionTriviallyDead(OpI))
+        if (isInstructionTriviallyDead(OpI, TLI))
           NowDeadInsts.push_back(OpI);
     }
 
@@ -164,10 +165,11 @@ static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE) {
 
 /// deleteIfDeadInstruction - If the specified value is a dead instruction,
 /// delete it and any recursively used instructions.
-static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE) {
+static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE,
+                                    const TargetLibraryInfo *TLI) {
   if (Instruction *I = dyn_cast<Instruction>(V))
-    if (isInstructionTriviallyDead(I))
-      deleteDeadInstruction(I, SE);
+    if (isInstructionTriviallyDead(I, TLI))
+      deleteDeadInstruction(I, SE, TLI);
 }
 
 bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
@@ -490,7 +492,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
                             StoreSize, getAnalysis<AliasAnalysis>(), TheStore)){
     Expander.clear();
     // If we generated new code for the base pointer, clean up.
-    deleteIfDeadInstruction(BasePtr, *SE);
+    deleteIfDeadInstruction(BasePtr, *SE, TLI);
     return false;
   }
 
@@ -538,7 +540,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
 
   // Okay, the memset has been formed.  Zap the original store and anything that
   // feeds into it.
-  deleteDeadInstruction(TheStore, *SE);
+  deleteDeadInstruction(TheStore, *SE, TLI);
   ++NumMemSet;
   return true;
 }
@@ -579,7 +581,7 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
                             getAnalysis<AliasAnalysis>(), SI)) {
     Expander.clear();
     // If we generated new code for the base pointer, clean up.
-    deleteIfDeadInstruction(StoreBasePtr, *SE);
+    deleteIfDeadInstruction(StoreBasePtr, *SE, TLI);
     return false;
   }
 
@@ -594,8 +596,8 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
                             StoreSize, getAnalysis<AliasAnalysis>(), SI)) {
     Expander.clear();
     // If we generated new code for the base pointer, clean up.
-    deleteIfDeadInstruction(LoadBasePtr, *SE);
-    deleteIfDeadInstruction(StoreBasePtr, *SE);
+    deleteIfDeadInstruction(LoadBasePtr, *SE, TLI);
+    deleteIfDeadInstruction(StoreBasePtr, *SE, TLI);
     return false;
   }
 
@@ -628,7 +630,7 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
 
   // Okay, the memset has been formed.  Zap the original store and anything that
   // feeds into it.
-  deleteDeadInstruction(SI, *SE);
+  deleteDeadInstruction(SI, *SE, TLI);
   ++NumMemCpy;
   return true;
 }
diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp
index 982400c..f5daa7b 100644
--- a/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -120,7 +120,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
             ++NumSimplified;
           }
         }
-        LocalChanged |= RecursivelyDeleteTriviallyDeadInstructions(I);
+        LocalChanged |= RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
 
         if (IsSubloopHeader && !isa<PHINode>(I))
           break;
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
index 7eeb152..abe07aa 100644
--- a/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -24,6 +24,7 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/Support/CFG.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/ADT/Statistic.h"
 using namespace llvm;
@@ -256,6 +257,7 @@ bool LoopRotate::rotateLoop(Loop *L) {
     return false;
 
   BasicBlock *OrigHeader = L->getHeader();
+  BasicBlock *OrigLatch = L->getLoopLatch();
 
   BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
   if (BI == 0 || BI->isUnconditional())
@@ -267,13 +269,9 @@ bool LoopRotate::rotateLoop(Loop *L) {
   if (!L->isLoopExiting(OrigHeader))
     return false;
 
-  // Updating PHInodes in loops with multiple exits adds complexity.
-  // Keep it simple, and restrict loop rotation to loops with one exit only.
-  // In future, lift this restriction and support for multiple exits if
-  // required.
-  SmallVector<BasicBlock*, 8> ExitBlocks;
-  L->getExitBlocks(ExitBlocks);
-  if (ExitBlocks.size() > 1)
+  // If the loop latch already contains a branch that leaves the loop then the
+  // loop is already rotated.
+  if (OrigLatch == 0 || L->isLoopExiting(OrigLatch))
     return false;
 
   // Check size of original header and reject loop if it is very big.
@@ -286,11 +284,10 @@ bool LoopRotate::rotateLoop(Loop *L) {
 
   // Now, this loop is suitable for rotation.
   BasicBlock *OrigPreheader = L->getLoopPreheader();
-  BasicBlock *OrigLatch = L->getLoopLatch();
 
   // If the loop could not be converted to canonical form, it must have an
   // indirectbr in it, just give up.
-  if (OrigPreheader == 0 || OrigLatch == 0)
+  if (OrigPreheader == 0)
     return false;
 
   // Anything ScalarEvolution may know about this loop or the PHI nodes
@@ -298,6 +295,8 @@ bool LoopRotate::rotateLoop(Loop *L) {
   if (ScalarEvolution *SE = getAnalysisIfAvailable<ScalarEvolution>())
     SE->forgetLoop(L);
 
+  DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
+
   // Find new Loop header. NewHeader is a Header's one and only successor
   // that is inside loop.  Header's other successor is outside the
   // loop.  Otherwise loop is not suitable for rotation.
@@ -408,10 +407,19 @@ bool LoopRotate::rotateLoop(Loop *L) {
     // Update DominatorTree to reflect the CFG change we just made.  Then split
     // edges as necessary to preserve LoopSimplify form.
     if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) {
-      // Since OrigPreheader now has the conditional branch to Exit block, it is
-      // the dominator of Exit.
-      DT->changeImmediateDominator(Exit, OrigPreheader);
-      DT->changeImmediateDominator(NewHeader, OrigPreheader);
+      // Everything that was dominated by the old loop header is now dominated
+      // by the original loop preheader. Conceptually the header was merged
+      // into the preheader, even though we reuse the actual block as a new
+      // loop latch.
+      DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
+      SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
+                                                   OrigHeaderNode->end());
+      DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader);
+      for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I)
+        DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode);
+
+      assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode);
+      assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode);
 
       // Update OrigHeader to be dominated by the new header block.
       DT->changeImmediateDominator(OrigHeader, OrigLatch);
@@ -440,6 +448,35 @@ bool LoopRotate::rotateLoop(Loop *L) {
       // Update OrigHeader to be dominated by the new header block.
       DT->changeImmediateDominator(NewHeader, OrigPreheader);
       DT->changeImmediateDominator(OrigHeader, OrigLatch);
+
+      // Brute force incremental dominator tree update. Call
+      // findNearestCommonDominator on all CFG predecessors of each child of the
+      // original header.
+      DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
+      SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
+                                                   OrigHeaderNode->end());
+      bool Changed;
+      do {
+        Changed = false;
+        for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) {
+          DomTreeNode *Node = HeaderChildren[I];
+          BasicBlock *BB = Node->getBlock();
+
+          pred_iterator PI = pred_begin(BB);
+          BasicBlock *NearestDom = *PI;
+          for (pred_iterator PE = pred_end(BB); PI != PE; ++PI)
+            NearestDom = DT->findNearestCommonDominator(NearestDom, *PI);
+
+          // Remember if this changes the DomTree.
+          if (Node->getIDom()->getBlock() != NearestDom) {
+            DT->changeImmediateDominator(BB, NearestDom);
+            Changed = true;
+          }
+        }
+
+      // If the dominator changed, this may have an effect on other
+      // predecessors, continue until we reach a fixpoint.
+      } while (Changed);
     }
   }
 
@@ -452,6 +489,8 @@ bool LoopRotate::rotateLoop(Loop *L) {
   // emitted code isn't too gross in this common case.
   MergeBlockIntoPredecessor(OrigHeader, this);
 
+  DEBUG(dbgs() << "LoopRotation: into "; L->dump());
+
   ++NumRotated;
   return true;
 }
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 0ae7a51..d7495da 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -121,9 +121,11 @@ void RegSortData::print(raw_ostream &OS) const {
   OS << "[NumUses=" << UsedByIndices.count() << ']';
 }
 
+#ifndef NDEBUG
 void RegSortData::dump() const {
   print(errs()); errs() << '\n';
 }
+#endif
 
 namespace {
 
@@ -414,9 +416,11 @@ void Formula::print(raw_ostream &OS) const {
   }
 }
 
+#ifndef NDEBUG
 void Formula::dump() const {
   print(errs()); errs() << '\n';
 }
+#endif
 
 /// isAddRecSExtable - Return true if the given addrec can be sign-extended
 /// without changing its value.
@@ -974,9 +978,11 @@ void Cost::print(raw_ostream &OS) const {
     OS << ", plus " << SetupCost << " setup cost";
 }
 
+#ifndef NDEBUG
 void Cost::dump() const {
   print(errs()); errs() << '\n';
 }
+#endif
 
 namespace {
 
@@ -1060,9 +1066,11 @@ void LSRFixup::print(raw_ostream &OS) const {
     OS << ", Offset=" << Offset;
 }
 
+#ifndef NDEBUG
 void LSRFixup::dump() const {
   print(errs()); errs() << '\n';
 }
+#endif
 
 namespace {
 
@@ -1252,9 +1260,11 @@ void LSRUse::print(raw_ostream &OS) const {
     OS << ", widest fixup type: " << *WidestFixupType;
 }
 
+#ifndef NDEBUG
 void LSRUse::dump() const {
   print(errs()); errs() << '\n';
 }
+#endif
 
 /// isLegalUse - Test whether the use described by AM is "legal", meaning it can
 /// be completely folded into the user instruction at isel time. This includes
@@ -3436,9 +3446,11 @@ void WorkItem::print(raw_ostream &OS) const {
      << " , add offset " << Imm;
 }
 
+#ifndef NDEBUG
 void WorkItem::dump() const {
   print(errs()); errs() << '\n';
 }
+#endif
 
 /// GenerateCrossUseConstantOffsets - Look for registers which are a constant
 /// distance apart and try to form reuse opportunities between them.
@@ -4731,9 +4743,11 @@ void LSRInstance::print(raw_ostream &OS) const {
   print_uses(OS);
 }
 
+#ifndef NDEBUG
 void LSRInstance::dump() const {
   print(errs()); errs() << '\n';
 }
+#endif
 
 namespace {
 
diff --git a/lib/Transforms/Scalar/ObjCARC.cpp b/lib/Transforms/Scalar/ObjCARC.cpp
index 3222f20..dce8e8b 100644
--- a/lib/Transforms/Scalar/ObjCARC.cpp
+++ b/lib/Transforms/Scalar/ObjCARC.cpp
@@ -1236,16 +1236,19 @@ bool ProvenanceAnalysis::relatedCheck(const Value *A, const Value *B) {
 
   // An ObjC-Identified object can't alias a load if it is never locally stored.
   if (AIsIdentified) {
+    // Check for an obvious escape.
+    if (isa<LoadInst>(B))
+      return isStoredObjCPointer(A);
     if (BIsIdentified) {
-      // If both pointers have provenance, they can be directly compared.
-      if (A != B)
-        return false;
-    } else {
-      if (isa<LoadInst>(B))
-        return isStoredObjCPointer(A);
+      // Check for an obvious escape.
+      if (isa<LoadInst>(A))
+        return isStoredObjCPointer(B);
+      // Both pointers are identified and escapes aren't an evident problem.
+      return false;
     }
-  } else {
-    if (BIsIdentified && isa<LoadInst>(A))
+  } else if (BIsIdentified) {
+    // Check for an obvious escape.
+    if (isa<LoadInst>(A))
       return isStoredObjCPointer(B);
   }
 
@@ -1381,9 +1384,6 @@ namespace {
   /// PtrState - This class summarizes several per-pointer runtime properties
   /// which are propogated through the flow graph.
   class PtrState {
-    /// NestCount - The known minimum level of retain+release nesting.
-    unsigned NestCount;
-
     /// KnownPositiveRefCount - True if the reference count is known to
     /// be incremented.
     bool KnownPositiveRefCount;
@@ -1401,7 +1401,7 @@ namespace {
     /// TODO: Encapsulate this better.
     RRInfo RRI;
 
-    PtrState() : NestCount(0), KnownPositiveRefCount(false), Partial(false),
+    PtrState() : KnownPositiveRefCount(false), Partial(false),
                  Seq(S_None) {}
 
     void SetKnownPositiveRefCount() {
@@ -1416,18 +1416,6 @@ namespace {
       return KnownPositiveRefCount;
     }
 
-    void IncrementNestCount() {
-      if (NestCount != UINT_MAX) ++NestCount;
-    }
-
-    void DecrementNestCount() {
-      if (NestCount != 0) --NestCount;
-    }
-
-    bool IsKnownNested() const {
-      return NestCount > 0;
-    }
-
     void SetSeq(Sequence NewSeq) {
       Seq = NewSeq;
     }
@@ -1454,7 +1442,6 @@ void
 PtrState::Merge(const PtrState &Other, bool TopDown) {
   Seq = MergeSeqs(Seq, Other.Seq, TopDown);
   KnownPositiveRefCount = KnownPositiveRefCount && Other.KnownPositiveRefCount;
-  NestCount = std::min(NestCount, Other.NestCount);
 
   // We can't merge a plain objc_retain with an objc_retainBlock.
   if (RRI.IsRetainBlock != Other.RRI.IsRetainBlock)
@@ -1868,6 +1855,26 @@ Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) {
   return AutoreleaseCallee;
 }
 
+/// IsPotentialUse - Test whether the given value is possible a
+/// reference-counted pointer, including tests which utilize AliasAnalysis.
+static bool IsPotentialUse(const Value *Op, AliasAnalysis &AA) {
+  // First make the rudimentary check.
+  if (!IsPotentialUse(Op))
+    return false;
+
+  // Objects in constant memory are not reference-counted.
+  if (AA.pointsToConstantMemory(Op))
+    return false;
+
+  // Pointers in constant memory are not pointing to reference-counted objects.
+  if (const LoadInst *LI = dyn_cast<LoadInst>(Op))
+    if (AA.pointsToConstantMemory(LI->getPointerOperand()))
+      return false;
+
+  // Otherwise assume the worst.
+  return true;
+}
+
 /// CanAlterRefCount - Test whether the given instruction can result in a
 /// reference count modification (positive or negative) for the pointer's
 /// object.
@@ -1894,7 +1901,7 @@ CanAlterRefCount(const Instruction *Inst, const Value *Ptr,
     for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
          I != E; ++I) {
       const Value *Op = *I;
-      if (IsPotentialUse(Op) && PA.related(Ptr, Op))
+      if (IsPotentialUse(Op, *PA.getAA()) && PA.related(Ptr, Op))
         return true;
     }
     return false;
@@ -1919,14 +1926,14 @@ CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA,
     // Comparing a pointer with null, or any other constant, isn't really a use,
     // because we don't care what the pointer points to, or about the values
     // of any other dynamic reference-counted pointers.
-    if (!IsPotentialUse(ICI->getOperand(1)))
+    if (!IsPotentialUse(ICI->getOperand(1), *PA.getAA()))
       return false;
   } else if (ImmutableCallSite CS = static_cast<const Value *>(Inst)) {
     // For calls, just check the arguments (and not the callee operand).
     for (ImmutableCallSite::arg_iterator OI = CS.arg_begin(),
          OE = CS.arg_end(); OI != OE; ++OI) {
       const Value *Op = *OI;
-      if (IsPotentialUse(Op) && PA.related(Ptr, Op))
+      if (IsPotentialUse(Op, *PA.getAA()) && PA.related(Ptr, Op))
         return true;
     }
     return false;
@@ -1936,14 +1943,14 @@ CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA,
     const Value *Op = GetUnderlyingObjCPtr(SI->getPointerOperand());
     // If we can't tell what the underlying object was, assume there is a
     // dependence.
-    return IsPotentialUse(Op) && PA.related(Op, Ptr);
+    return IsPotentialUse(Op, *PA.getAA()) && PA.related(Op, Ptr);
   }
 
   // Check each operand for a match.
   for (User::const_op_iterator OI = Inst->op_begin(), OE = Inst->op_end();
        OI != OE; ++OI) {
     const Value *Op = *OI;
-    if (IsPotentialUse(Op) && PA.related(Ptr, Op))
+    if (IsPotentialUse(Op, *PA.getAA()) && PA.related(Ptr, Op))
       return true;
   }
   return false;
@@ -2612,11 +2619,11 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
     MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind);
     S.ResetSequenceProgress(ReleaseMetadata ? S_MovableRelease : S_Release);
     S.RRI.ReleaseMetadata = ReleaseMetadata;
-    S.RRI.KnownSafe = S.IsKnownNested() || S.IsKnownIncremented();
+    S.RRI.KnownSafe = S.IsKnownIncremented();
     S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
     S.RRI.Calls.insert(Inst);
 
-    S.IncrementNestCount();
+    S.SetKnownPositiveRefCount();
     break;
   }
   case IC_RetainBlock:
@@ -2631,7 +2638,6 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
 
     PtrState &S = MyStates.getPtrBottomUpState(Arg);
     S.SetKnownPositiveRefCount();
-    S.DecrementNestCount();
 
     switch (S.GetSeq()) {
     case S_Stop:
@@ -2747,8 +2753,9 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
 
   // Merge the states from each successor to compute the initial state
   // for the current block.
-  for (BBState::edge_iterator SI(MyStates.succ_begin()),
-       SE(MyStates.succ_end()); SI != SE; ++SI) {
+  BBState::edge_iterator SI(MyStates.succ_begin()),
+                         SE(MyStates.succ_end());
+  if (SI != SE) {
     const BasicBlock *Succ = *SI;
     DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ);
     assert(I != BBStates.end());
@@ -2760,7 +2767,6 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
       assert(I != BBStates.end());
       MyStates.MergeSucc(I->second);
     }
-    break;
   }
 
   // Visit all the instructions, bottom-up.
@@ -2823,12 +2829,11 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
 
       S.ResetSequenceProgress(S_Retain);
       S.RRI.IsRetainBlock = Class == IC_RetainBlock;
-      // Don't check S.IsKnownIncremented() here because it's not sufficient.
-      S.RRI.KnownSafe = S.IsKnownNested();
+      S.RRI.KnownSafe = S.IsKnownIncremented();
       S.RRI.Calls.insert(Inst);
     }
 
-    S.IncrementNestCount();
+    S.SetKnownPositiveRefCount();
 
     // A retain can be a potential use; procede to the generic checking
     // code below.
@@ -2838,7 +2843,7 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
     Arg = GetObjCArg(Inst);
 
     PtrState &S = MyStates.getPtrTopDownState(Arg);
-    S.DecrementNestCount();
+    S.ClearRefCount();
 
     switch (S.GetSeq()) {
     case S_Retain:
@@ -2935,8 +2940,9 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB,
 
   // Merge the states from each predecessor to compute the initial state
   // for the current block.
-  for (BBState::edge_iterator PI(MyStates.pred_begin()),
-       PE(MyStates.pred_end()); PI != PE; ++PI) {
+  BBState::edge_iterator PI(MyStates.pred_begin()),
+                         PE(MyStates.pred_end());
+  if (PI != PE) {
     const BasicBlock *Pred = *PI;
     DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred);
     assert(I != BBStates.end());
@@ -2948,7 +2954,6 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB,
       assert(I != BBStates.end());
       MyStates.MergePred(I->second);
     }
-    break;
   }
 
   // Visit all the instructions, top-down.
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index d13e4ab..6d27db1 100644
--- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -59,9 +59,9 @@ FunctionPass *llvm::createCFGSimplificationPass() {
   return new CFGSimplifyPass();
 }
 
-/// ChangeToUnreachable - Insert an unreachable instruction before the specified
+/// changeToUnreachable - Insert an unreachable instruction before the specified
 /// instruction, making it and the rest of the code in the block dead.
-static void ChangeToUnreachable(Instruction *I, bool UseLLVMTrap) {
+static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) {
   BasicBlock *BB = I->getParent();
   // Loop over all of the successors, removing BB's entry from any PHI
   // nodes.
@@ -87,8 +87,8 @@ static void ChangeToUnreachable(Instruction *I, bool UseLLVMTrap) {
   }
 }
 
-/// ChangeToCall - Convert the specified invoke into a normal call.
-static void ChangeToCall(InvokeInst *II) {
+/// changeToCall - Convert the specified invoke into a normal call.
+static void changeToCall(InvokeInst *II) {
   SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3);
   CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II);
   NewCall->takeName(II);
@@ -105,7 +105,7 @@ static void ChangeToCall(InvokeInst *II) {
   II->eraseFromParent();
 }
 
-static bool MarkAliveBlocks(BasicBlock *BB,
+static bool markAliveBlocks(BasicBlock *BB,
                             SmallPtrSet<BasicBlock*, 128> &Reachable) {
 
   SmallVector<BasicBlock*, 128> Worklist;
@@ -129,7 +129,7 @@ static bool MarkAliveBlocks(BasicBlock *BB,
           ++BBI;
           if (!isa<UnreachableInst>(BBI)) {
             // Don't insert a call to llvm.trap right before the unreachable.
-            ChangeToUnreachable(BBI, false);
+            changeToUnreachable(BBI, false);
             Changed = true;
           }
           break;
@@ -148,7 +148,7 @@ static bool MarkAliveBlocks(BasicBlock *BB,
         if (isa<UndefValue>(Ptr) ||
             (isa<ConstantPointerNull>(Ptr) &&
              SI->getPointerAddressSpace() == 0)) {
-          ChangeToUnreachable(SI, true);
+          changeToUnreachable(SI, true);
           Changed = true;
           break;
         }
@@ -159,7 +159,7 @@ static bool MarkAliveBlocks(BasicBlock *BB,
     if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) {
       Value *Callee = II->getCalledValue();
       if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
-        ChangeToUnreachable(II, true);
+        changeToUnreachable(II, true);
         Changed = true;
       } else if (II->doesNotThrow()) {
         if (II->use_empty() && II->onlyReadsMemory()) {
@@ -168,7 +168,7 @@ static bool MarkAliveBlocks(BasicBlock *BB,
           II->getUnwindDest()->removePredecessor(II->getParent());
           II->eraseFromParent();
         } else
-          ChangeToCall(II);
+          changeToCall(II);
         Changed = true;
       }
     }
@@ -180,12 +180,12 @@ static bool MarkAliveBlocks(BasicBlock *BB,
   return Changed;
 }
 
-/// RemoveUnreachableBlocksFromFn - Remove blocks that are not reachable, even
+/// removeUnreachableBlocksFromFn - Remove blocks that are not reachable, even
 /// if they are in a dead cycle.  Return true if a change was made, false
 /// otherwise.
-static bool RemoveUnreachableBlocksFromFn(Function &F) {
+static bool removeUnreachableBlocksFromFn(Function &F) {
   SmallPtrSet<BasicBlock*, 128> Reachable;
-  bool Changed = MarkAliveBlocks(F.begin(), Reachable);
+  bool Changed = markAliveBlocks(F.begin(), Reachable);
 
   // If there are unreachable blocks in the CFG...
   if (Reachable.size() == F.size())
@@ -215,9 +215,9 @@ static bool RemoveUnreachableBlocksFromFn(Function &F) {
   return true;
 }
 
-/// MergeEmptyReturnBlocks - If we have more than one empty (other than phi
+/// mergeEmptyReturnBlocks - If we have more than one empty (other than phi
 /// node) return blocks, merge them together to promote recursive block merging.
-static bool MergeEmptyReturnBlocks(Function &F) {
+static bool mergeEmptyReturnBlocks(Function &F) {
   bool Changed = false;
 
   BasicBlock *RetBlock = 0;
@@ -291,9 +291,9 @@ static bool MergeEmptyReturnBlocks(Function &F) {
   return Changed;
 }
 
-/// IterativeSimplifyCFG - Call SimplifyCFG on all the blocks in the function,
+/// iterativelySimplifyCFG - Call SimplifyCFG on all the blocks in the function,
 /// iterating until no more changes are made.
-static bool IterativeSimplifyCFG(Function &F, const TargetData *TD) {
+static bool iterativelySimplifyCFG(Function &F, const TargetData *TD) {
   bool Changed = false;
   bool LocalChange = true;
   while (LocalChange) {
@@ -317,24 +317,24 @@ static bool IterativeSimplifyCFG(Function &F, const TargetData *TD) {
 //
 bool CFGSimplifyPass::runOnFunction(Function &F) {
   const TargetData *TD = getAnalysisIfAvailable<TargetData>();
-  bool EverChanged = RemoveUnreachableBlocksFromFn(F);
-  EverChanged |= MergeEmptyReturnBlocks(F);
-  EverChanged |= IterativeSimplifyCFG(F, TD);
+  bool EverChanged = removeUnreachableBlocksFromFn(F);
+  EverChanged |= mergeEmptyReturnBlocks(F);
+  EverChanged |= iterativelySimplifyCFG(F, TD);
 
   // If neither pass changed anything, we're done.
   if (!EverChanged) return false;
 
-  // IterativeSimplifyCFG can (rarely) make some loops dead.  If this happens,
-  // RemoveUnreachableBlocksFromFn is needed to nuke them, which means we should
+  // iterativelySimplifyCFG can (rarely) make some loops dead.  If this happens,
+  // removeUnreachableBlocksFromFn is needed to nuke them, which means we should
   // iterate between the two optimizations.  We structure the code like this to
-  // avoid reruning IterativeSimplifyCFG if the second pass of
-  // RemoveUnreachableBlocksFromFn doesn't do anything.
-  if (!RemoveUnreachableBlocksFromFn(F))
+  // avoid reruning iterativelySimplifyCFG if the second pass of
+  // removeUnreachableBlocksFromFn doesn't do anything.
+  if (!removeUnreachableBlocksFromFn(F))
     return true;
 
   do {
-    EverChanged = IterativeSimplifyCFG(F, TD);
-    EverChanged |= RemoveUnreachableBlocksFromFn(F);
+    EverChanged = iterativelySimplifyCFG(F, TD);
+    EverChanged |= removeUnreachableBlocksFromFn(F);
   } while (EverChanged);
 
   return true;
diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
index 3904419..65311fe 100644
--- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
@@ -28,6 +28,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetData.h"
@@ -38,6 +39,10 @@ using namespace llvm;
 STATISTIC(NumSimplified, "Number of library calls simplified");
 STATISTIC(NumAnnotated, "Number of attributes added to library functions");
 
+static cl::opt<bool> UnsafeFPShrink("enable-double-float-shrink", cl::Hidden,
+                                   cl::init(false),
+                                   cl::desc("Enable unsafe double to float "
+                                            "shrinking for math lib calls"));
 //===----------------------------------------------------------------------===//
 // Optimizer Base Class
 //===----------------------------------------------------------------------===//
@@ -893,16 +898,56 @@ struct MemSetOpt : public LibCallOptimization {
 //===----------------------------------------------------------------------===//
 
 //===---------------------------------------===//
-// 'cos*' Optimizations
+// Double -> Float Shrinking Optimizations for Unary Functions like 'floor'
+
+struct UnaryDoubleFPOpt : public LibCallOptimization {
+  bool CheckRetType;
+  UnaryDoubleFPOpt(bool CheckReturnType): CheckRetType(CheckReturnType) {}
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() ||
+        !FT->getParamType(0)->isDoubleTy())
+      return 0;
+
+    if (CheckRetType) {
+      // Check if all the uses for function like 'sin' are converted to float.
+      for (Value::use_iterator UseI = CI->use_begin(); UseI != CI->use_end();
+          ++UseI) {
+        FPTruncInst *Cast = dyn_cast<FPTruncInst>(*UseI);
+        if (Cast == 0 || !Cast->getType()->isFloatTy())
+          return 0;
+      }
+    }
+
+    // If this is something like 'floor((double)floatval)', convert to floorf.
+    FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0));
+    if (Cast == 0 || !Cast->getOperand(0)->getType()->isFloatTy())
+      return 0;
+
+    // floor((double)floatval) -> (double)floorf(floatval)
+    Value *V = Cast->getOperand(0);
+    V = EmitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes());
+    return B.CreateFPExt(V, B.getDoubleTy());
+  }
+};
 
+//===---------------------------------------===//
+// 'cos*' Optimizations
 struct CosOpt : public LibCallOptimization {
   virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    Value *Ret = NULL;
+    if (UnsafeFPShrink && Callee->getName() == "cos" &&
+        TLI->has(LibFunc::cosf)) {
+      UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+      Ret = UnsafeUnaryDoubleFP.CallOptimizer(Callee, CI, B);
+    }
+
     FunctionType *FT = Callee->getFunctionType();
     // Just make sure this has 1 argument of FP type, which matches the
     // result type.
     if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
         !FT->getParamType(0)->isFloatingPointTy())
-      return 0;
+      return Ret;
 
     // cos(-x) -> cos(x)
     Value *Op1 = CI->getArgOperand(0);
@@ -910,7 +955,7 @@ struct CosOpt : public LibCallOptimization {
       BinaryOperator *BinExpr = cast<BinaryOperator>(Op1);
       return B.CreateCall(Callee, BinExpr->getOperand(1), "cos");
     }
-    return 0;
+    return Ret;
   }
 };
 
@@ -919,13 +964,20 @@ struct CosOpt : public LibCallOptimization {
 
 struct PowOpt : public LibCallOptimization {
   virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    Value *Ret = NULL;
+    if (UnsafeFPShrink && Callee->getName() == "pow" &&
+        TLI->has(LibFunc::powf)) {
+      UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+      Ret = UnsafeUnaryDoubleFP.CallOptimizer(Callee, CI, B);
+    }
+
     FunctionType *FT = Callee->getFunctionType();
     // Just make sure this has 2 arguments of the same FP type, which match the
     // result type.
     if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
         FT->getParamType(0) != FT->getParamType(1) ||
         !FT->getParamType(0)->isFloatingPointTy())
-      return 0;
+      return Ret;
 
     Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1);
     if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) {
@@ -936,7 +988,7 @@ struct PowOpt : public LibCallOptimization {
     }
 
     ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2);
-    if (Op2C == 0) return 0;
+    if (Op2C == 0) return Ret;
 
     if (Op2C->getValueAPF().isZero())  // pow(x, 0.0) -> 1.0
       return ConstantFP::get(CI->getType(), 1.0);
@@ -974,12 +1026,19 @@ struct PowOpt : public LibCallOptimization {
 
 struct Exp2Opt : public LibCallOptimization {
   virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    Value *Ret = NULL;
+    if (UnsafeFPShrink && Callee->getName() == "exp2" &&
+        TLI->has(LibFunc::exp2)) {
+      UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+      Ret = UnsafeUnaryDoubleFP.CallOptimizer(Callee, CI, B);
+    }
+
     FunctionType *FT = Callee->getFunctionType();
     // Just make sure this has 1 argument of FP type, which matches the
     // result type.
     if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
         !FT->getParamType(0)->isFloatingPointTy())
-      return 0;
+      return Ret;
 
     Value *Op = CI->getArgOperand(0);
     // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x))  if sizeof(x) <= 32
@@ -1016,29 +1075,7 @@ struct Exp2Opt : public LibCallOptimization {
 
       return CI;
     }
-    return 0;
-  }
-};
-
-//===---------------------------------------===//
-// Double -> Float Shrinking Optimizations for Unary Functions like 'floor'
-
-struct UnaryDoubleFPOpt : public LibCallOptimization {
-  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() ||
-        !FT->getParamType(0)->isDoubleTy())
-      return 0;
-
-    // If this is something like 'floor((double)floatval)', convert to floorf.
-    FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0));
-    if (Cast == 0 || !Cast->getOperand(0)->getType()->isFloatTy())
-      return 0;
-
-    // floor((double)floatval) -> (double)floorf(floatval)
-    Value *V = Cast->getOperand(0);
-    V = EmitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes());
-    return B.CreateFPExt(V, B.getDoubleTy());
+    return Ret;
   }
 };
 
@@ -1534,7 +1571,8 @@ namespace {
     StrToOpt StrTo; StrSpnOpt StrSpn; StrCSpnOpt StrCSpn; StrStrOpt StrStr;
     MemCmpOpt MemCmp; MemCpyOpt MemCpy; MemMoveOpt MemMove; MemSetOpt MemSet;
     // Math Library Optimizations
-    CosOpt Cos; PowOpt Pow; Exp2Opt Exp2; UnaryDoubleFPOpt UnaryDoubleFP;
+    CosOpt Cos; PowOpt Pow; Exp2Opt Exp2;
+    UnaryDoubleFPOpt UnaryDoubleFP, UnsafeUnaryDoubleFP;
     // Integer Optimizations
     FFSOpt FFS; AbsOpt Abs; IsDigitOpt IsDigit; IsAsciiOpt IsAscii;
     ToAsciiOpt ToAscii;
@@ -1547,10 +1585,13 @@ namespace {
   public:
     static char ID; // Pass identification
     SimplifyLibCalls() : FunctionPass(ID), StrCpy(false), StrCpyChk(true),
-                         StpCpy(false), StpCpyChk(true) {
+                         StpCpy(false), StpCpyChk(true),
+                         UnaryDoubleFP(false), UnsafeUnaryDoubleFP(true) {
       initializeSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
     }
     void AddOpt(LibFunc::Func F, LibCallOptimization* Opt);
+    void AddOpt(LibFunc::Func F1, LibFunc::Func F2, LibCallOptimization* Opt);
+
     void InitOptimizations();
     bool runOnFunction(Function &F);
 
@@ -1586,6 +1627,12 @@ void SimplifyLibCalls::AddOpt(LibFunc::Func F, LibCallOptimization* Opt) {
     Optimizations[TLI->getName(F)] = Opt;
 }
 
+void SimplifyLibCalls::AddOpt(LibFunc::Func F1, LibFunc::Func F2,
+                              LibCallOptimization* Opt) {
+  if (TLI->has(F1) && TLI->has(F2))
+    Optimizations[TLI->getName(F1)] = Opt;
+}
+
 /// Optimizations - Populate the Optimizations map with all the optimizations
 /// we know.
 void SimplifyLibCalls::InitOptimizations() {
@@ -1641,20 +1688,37 @@ void SimplifyLibCalls::InitOptimizations() {
   Optimizations["llvm.exp2.f64"] = &Exp2;
   Optimizations["llvm.exp2.f32"] = &Exp2;
 
-  if (TLI->has(LibFunc::fabs) && TLI->has(LibFunc::fabsf))
-    Optimizations["fabs"] = &UnaryDoubleFP;
-  if (TLI->has(LibFunc::floor) && TLI->has(LibFunc::floorf))
-    Optimizations["floor"] = &UnaryDoubleFP;
-  if (TLI->has(LibFunc::ceil) && TLI->has(LibFunc::ceilf))
-    Optimizations["ceil"] = &UnaryDoubleFP;
-  if (TLI->has(LibFunc::round) && TLI->has(LibFunc::roundf))
-    Optimizations["round"] = &UnaryDoubleFP;
-  if (TLI->has(LibFunc::rint) && TLI->has(LibFunc::rintf))
-    Optimizations["rint"] = &UnaryDoubleFP;
-  if (TLI->has(LibFunc::nearbyint) && TLI->has(LibFunc::nearbyintf))
-    Optimizations["nearbyint"] = &UnaryDoubleFP;
-  if (TLI->has(LibFunc::trunc) && TLI->has(LibFunc::truncf))
-    Optimizations["trunc"] = &UnaryDoubleFP;
+  AddOpt(LibFunc::ceil, LibFunc::ceilf, &UnaryDoubleFP);
+  AddOpt(LibFunc::fabs, LibFunc::fabsf, &UnaryDoubleFP);
+  AddOpt(LibFunc::floor, LibFunc::floorf, &UnaryDoubleFP);
+  AddOpt(LibFunc::rint, LibFunc::rintf, &UnaryDoubleFP);
+  AddOpt(LibFunc::round, LibFunc::roundf, &UnaryDoubleFP);
+  AddOpt(LibFunc::nearbyint, LibFunc::nearbyintf, &UnaryDoubleFP);
+  AddOpt(LibFunc::trunc, LibFunc::truncf, &UnaryDoubleFP);
+
+  if(UnsafeFPShrink) {
+    AddOpt(LibFunc::acos, LibFunc::acosf, &UnsafeUnaryDoubleFP);
+    AddOpt(LibFunc::acosh, LibFunc::acoshf, &UnsafeUnaryDoubleFP);
+    AddOpt(LibFunc::asin, LibFunc::asinf, &UnsafeUnaryDoubleFP);
+    AddOpt(LibFunc::asinh, LibFunc::asinhf, &UnsafeUnaryDoubleFP);
+    AddOpt(LibFunc::atan, LibFunc::atanf, &UnsafeUnaryDoubleFP);
+    AddOpt(LibFunc::atanh, LibFunc::atanhf, &UnsafeUnaryDoubleFP);
+    AddOpt(LibFunc::cbrt, LibFunc::cbrtf, &UnsafeUnaryDoubleFP);
+    AddOpt(LibFunc::cosh, LibFunc::coshf, &UnsafeUnaryDoubleFP);
+    AddOpt(LibFunc::exp, LibFunc::expf, &UnsafeUnaryDoubleFP);
+    AddOpt(LibFunc::exp10, LibFunc::exp10f, &UnsafeUnaryDoubleFP);
+    AddOpt(LibFunc::expm1, LibFunc::expm1f, &UnsafeUnaryDoubleFP);
+    AddOpt(LibFunc::log, LibFunc::logf, &UnsafeUnaryDoubleFP);
+    AddOpt(LibFunc::log10, LibFunc::log10f, &UnsafeUnaryDoubleFP);
+    AddOpt(LibFunc::log1p, LibFunc::log1pf, &UnsafeUnaryDoubleFP);
+    AddOpt(LibFunc::log2, LibFunc::log2f, &UnsafeUnaryDoubleFP);
+    AddOpt(LibFunc::logb, LibFunc::logbf, &UnsafeUnaryDoubleFP);
+    AddOpt(LibFunc::sin, LibFunc::sinf, &UnsafeUnaryDoubleFP);
+    AddOpt(LibFunc::sinh, LibFunc::sinhf, &UnsafeUnaryDoubleFP);
+    AddOpt(LibFunc::sqrt, LibFunc::sqrtf, &UnsafeUnaryDoubleFP);
+    AddOpt(LibFunc::tan, LibFunc::tanf, &UnsafeUnaryDoubleFP);
+    AddOpt(LibFunc::tanh, LibFunc::tanhf, &UnsafeUnaryDoubleFP);
+  }
 
   // Integer Optimizations
   Optimizations["ffs"] = &FFS;