Diffstat (limited to 'lib/Transforms/Scalar')
24 files changed, 1805 insertions, 1650 deletions
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp index a5adb5e..ba214d1 100644 --- a/lib/Transforms/Scalar/ADCE.cpp +++ b/lib/Transforms/Scalar/ADCE.cpp @@ -57,6 +57,7 @@ bool ADCE::runOnFunction(Function& F) { for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) if (isa<TerminatorInst>(I.getInstructionIterator()) || isa<DbgInfoIntrinsic>(I.getInstructionIterator()) || + isa<LandingPadInst>(I.getInstructionIterator()) || I->mayHaveSideEffects()) { alive.insert(I.getInstructionIterator()); worklist.push_back(I.getInstructionIterator()); @@ -65,7 +66,6 @@ bool ADCE::runOnFunction(Function& F) { // Propagate liveness backwards to operands. while (!worklist.empty()) { Instruction* curr = worklist.pop_back_val(); - for (Instruction::op_iterator OI = curr->op_begin(), OE = curr->op_end(); OI != OE; ++OI) if (Instruction* Inst = dyn_cast<Instruction>(OI)) diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index c223da6..a6f0cf3 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -7,6 +7,7 @@ add_llvm_library(LLVMScalarOpts DCE.cpp DeadStoreElimination.cpp EarlyCSE.cpp + GlobalMerge.cpp GVN.cpp IndVarSimplify.cpp JumpThreading.cpp @@ -29,6 +30,14 @@ add_llvm_library(LLVMScalarOpts SimplifyCFGPass.cpp SimplifyLibCalls.cpp Sink.cpp - TailDuplication.cpp TailRecursionElimination.cpp ) + +add_llvm_library_dependencies(LLVMScalarOpts + LLVMAnalysis + LLVMCore + LLVMInstCombine + LLVMSupport + LLVMTarget + LLVMTransformUtils + ) diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index 17beeb5..f8f18b2 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -58,6 +58,7 @@ STATISTIC(NumMemoryInsts, "Number of memory instructions whose address " STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads"); STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized"); STATISTIC(NumRetsDup, "Number of return instructions duplicated"); +STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); static cl::opt<bool> DisableBranchOpts( "disable-cgp-branch-opts", cl::Hidden, cl::init(false), @@ -110,6 +111,7 @@ namespace { bool MoveExtToFormExtLoad(Instruction *I); bool OptimizeExtUses(Instruction *I); bool DupRetToEnableTailCallOpts(ReturnInst *RI); + bool PlaceDbgValues(Function &F); }; } @@ -132,6 +134,11 @@ bool CodeGenPrepare::runOnFunction(Function &F) { // unconditional branch. EverMadeChange |= EliminateMostlyEmptyBlocks(F); + // llvm.dbg.value is far away from the value then iSel may not be able + // handle it properly. iSel will drop llvm.dbg.value if it can not + // find a node corresponding to the value. 
+ EverMadeChange |= PlaceDbgValues(F); + bool MadeChange = true; while (MadeChange) { MadeChange = false; @@ -410,8 +417,7 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){ CastInst *&InsertedCast = InsertedCasts[UserBB]; if (!InsertedCast) { - BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI(); - + BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); InsertedCast = CastInst::Create(CI->getOpcode(), CI->getOperand(0), CI->getType(), "", InsertPt); @@ -467,8 +473,7 @@ static bool OptimizeCmpExpression(CmpInst *CI) { CmpInst *&InsertedCmp = InsertedCmps[UserBB]; if (!InsertedCmp) { - BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI(); - + BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); InsertedCmp = CmpInst::Create(CI->getOpcode(), CI->getPredicate(), CI->getOperand(0), @@ -551,22 +556,6 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { // From here on out we're working with named functions. if (CI->getCalledFunction() == 0) return false; - // llvm.dbg.value is far away from the value then iSel may not be able - // handle it properly. iSel will drop llvm.dbg.value if it can not - // find a node corresponding to the value. - if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(CI)) - if (Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue())) - if (!VI->isTerminator() && - (DVI->getParent() != VI->getParent() || DT->dominates(DVI, VI))) { - DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI); - DVI->removeFromParent(); - if (isa<PHINode>(VI)) - DVI->insertBefore(VI->getParent()->getFirstNonPHI()); - else - DVI->insertAfter(VI); - return true; - } - // We'll need TargetData from here on out. const TargetData *TD = TLI ? TLI->getTargetData() : 0; if (!TD) return false; @@ -746,13 +735,11 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, worklist.pop_back(); // Break use-def graph loops. - if (Visited.count(V)) { + if (!Visited.insert(V)) { Consensus = 0; break; } - Visited.insert(V); - // For a PHI node, push all of its incoming values. if (PHINode *P = dyn_cast<PHINode>(V)) { for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) @@ -763,7 +750,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // For non-PHIs, determine the addressing mode being computed. SmallVector<Instruction*, 16> NewAddrModeInsts; ExtAddrMode NewAddrMode = - AddressingModeMatcher::Match(V, AccessTy,MemoryInst, + AddressingModeMatcher::Match(V, AccessTy, MemoryInst, NewAddrModeInsts, *TLI); // This check is broken into two cases with very similar code to avoid using @@ -822,7 +809,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // Insert this computation right after this user. Since our caller is // scanning from the top of the BB to the bottom, reuse of the expr are // guaranteed to happen later. - BasicBlock::iterator InsertPt = MemoryInst; + IRBuilder<> Builder(MemoryInst); // Now that we determined the addressing expression we want to use and know // that we have to sink it into this block. 
Check to see if we have already @@ -833,7 +820,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for " << *MemoryInst); if (SunkAddr->getType() != Addr->getType()) - SunkAddr = new BitCastInst(SunkAddr, Addr->getType(), "tmp", InsertPt); + SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType()); } else { DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " << *MemoryInst); @@ -850,10 +837,9 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, if (AddrMode.BaseReg) { Value *V = AddrMode.BaseReg; if (V->getType()->isPointerTy()) - V = new PtrToIntInst(V, IntPtrTy, "sunkaddr", InsertPt); + V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr"); if (V->getType() != IntPtrTy) - V = CastInst::CreateIntegerCast(V, IntPtrTy, /*isSigned=*/true, - "sunkaddr", InsertPt); + V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr"); Result = V; } @@ -863,29 +849,27 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, if (V->getType() == IntPtrTy) { // done. } else if (V->getType()->isPointerTy()) { - V = new PtrToIntInst(V, IntPtrTy, "sunkaddr", InsertPt); + V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr"); } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() < cast<IntegerType>(V->getType())->getBitWidth()) { - V = new TruncInst(V, IntPtrTy, "sunkaddr", InsertPt); + V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr"); } else { - V = new SExtInst(V, IntPtrTy, "sunkaddr", InsertPt); + V = Builder.CreateSExt(V, IntPtrTy, "sunkaddr"); } if (AddrMode.Scale != 1) - V = BinaryOperator::CreateMul(V, ConstantInt::get(IntPtrTy, - AddrMode.Scale), - "sunkaddr", InsertPt); + V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale), + "sunkaddr"); if (Result) - Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt); + Result = Builder.CreateAdd(Result, V, "sunkaddr"); else Result = V; } // Add in the BaseGV if present. 
if (AddrMode.BaseGV) { - Value *V = new PtrToIntInst(AddrMode.BaseGV, IntPtrTy, "sunkaddr", - InsertPt); + Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr"); if (Result) - Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt); + Result = Builder.CreateAdd(Result, V, "sunkaddr"); else Result = V; } @@ -894,7 +878,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, if (AddrMode.BaseOffs) { Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs); if (Result) - Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt); + Result = Builder.CreateAdd(Result, V, "sunkaddr"); else Result = V; } @@ -902,7 +886,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, if (Result == 0) SunkAddr = Constant::getNullValue(Addr->getType()); else - SunkAddr = new IntToPtrInst(Result, Addr->getType(), "sunkaddr",InsertPt); + SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr"); } MemoryInst->replaceUsesOfWith(Repl, SunkAddr); @@ -1059,8 +1043,7 @@ bool CodeGenPrepare::OptimizeExtUses(Instruction *I) { Instruction *&InsertedTrunc = InsertedTruncs[UserBB]; if (!InsertedTrunc) { - BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI(); - + BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); InsertedTrunc = new TruncInst(I, Src->getType(), "", InsertPt); } @@ -1159,3 +1142,34 @@ bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) { return MadeChange; } + +// llvm.dbg.value is far away from the value then iSel may not be able +// handle it properly. iSel will drop llvm.dbg.value if it can not +// find a node corresponding to the value. +bool CodeGenPrepare::PlaceDbgValues(Function &F) { + bool MadeChange = false; + for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { + Instruction *PrevNonDbgInst = NULL; + for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) { + Instruction *Insn = BI; ++BI; + DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn); + if (!DVI) { + PrevNonDbgInst = Insn; + continue; + } + + Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue()); + if (VI && VI != PrevNonDbgInst && !VI->isTerminator()) { + DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI); + DVI->removeFromParent(); + if (isa<PHINode>(VI)) + DVI->insertBefore(VI->getParent()->getFirstInsertionPt()); + else + DVI->insertAfter(VI); + MadeChange = true; + ++NumDbgValueMoved; + } + } + } + return MadeChange; +} diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index e6089a9..a593d0f 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -52,18 +52,18 @@ namespace { AA = &getAnalysis<AliasAnalysis>(); MD = &getAnalysis<MemoryDependenceAnalysis>(); DominatorTree &DT = getAnalysis<DominatorTree>(); - + bool Changed = false; for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) // Only check non-dead blocks. Dead blocks may have strange pointer // cycles that will confuse alias analysis. 
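A note on the CodeGenPrepare::OptimizeMemoryInst hunks above: the address computation that gets sunk next to the memory access is no longer built with raw constructors taking an explicit InsertPt; an IRBuilder positioned at MemoryInst now carries the insertion point. Below is only a sketch of that pattern, reduced to a base pointer plus a constant offset, using illustrative names (SinkBasePlusOffset and its parameter list are not part of the patch) and only the builder calls visible in the hunks.

  #include "llvm/Constants.h"
  #include "llvm/Instructions.h"
  #include "llvm/Support/IRBuilder.h"
  using namespace llvm;

  // Everything created by the builder is inserted immediately before
  // MemoryInst, which is what the old explicit InsertPt achieved.
  static Value *SinkBasePlusOffset(Instruction *MemoryInst, Value *Base,
                                   int64_t Offset, Type *IntPtrTy,
                                   Type *AddrTy) {
    IRBuilder<> Builder(MemoryInst);
    Value *V = Base;
    if (V->getType()->isPointerTy())
      V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
    if (Offset)
      V = Builder.CreateAdd(V, ConstantInt::get(IntPtrTy, Offset), "sunkaddr");
    return Builder.CreateIntToPtr(V, AddrTy, "sunkaddr");
  }

The same builder also covers the reuse path in the patch, where a previously sunk address only needs a Builder.CreateBitCast to the type expected by the new user.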
if (DT.isReachableFromEntry(I)) Changed |= runOnBasicBlock(*I); - + AA = 0; MD = 0; return Changed; } - + bool runOnBasicBlock(BasicBlock &BB); bool HandleFree(CallInst *F); bool handleEndBlock(BasicBlock &BB); @@ -105,34 +105,34 @@ static void DeleteDeadInstruction(Instruction *I, MemoryDependenceAnalysis &MD, SmallPtrSet<Value*, 16> *ValueSet = 0) { SmallVector<Instruction*, 32> NowDeadInsts; - + NowDeadInsts.push_back(I); --NumFastOther; - + // Before we touch this instruction, remove it from memdep! do { Instruction *DeadInst = NowDeadInsts.pop_back_val(); ++NumFastOther; - + // This instruction is dead, zap it, in stages. Start by removing it from // MemDep, which needs to know the operands and needs it to be in the // function. MD.removeInstruction(DeadInst); - + for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) { Value *Op = DeadInst->getOperand(op); DeadInst->setOperand(op, 0); - + // If this operand just became dead, add it to the NowDeadInsts list. if (!Op->use_empty()) continue; - + if (Instruction *OpI = dyn_cast<Instruction>(Op)) if (isInstructionTriviallyDead(OpI)) NowDeadInsts.push_back(OpI); } - + DeadInst->eraseFromParent(); - + if (ValueSet) ValueSet->erase(DeadInst); } while (!NowDeadInsts.empty()); } @@ -159,11 +159,13 @@ static bool hasMemoryWrite(Instruction *I) { } /// getLocForWrite - Return a Location stored to by the specified instruction. +/// If isRemovable returns true, this function and getLocForRead completely +/// describe the memory operations for this instruction. static AliasAnalysis::Location getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) return AA.getLocation(SI); - + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Inst)) { // memcpy/memmove/memset. AliasAnalysis::Location Loc = AA.getLocationForDest(MI); @@ -174,10 +176,10 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { return AliasAnalysis::Location(); return Loc; } - + IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst); if (II == 0) return AliasAnalysis::Location(); - + switch (II->getIntrinsicID()) { default: return AliasAnalysis::Location(); // Unhandled intrinsic. case Intrinsic::init_trampoline: @@ -185,7 +187,7 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { // that we should use the size of the pointee type. This isn't valid for // init.trampoline, which writes more than an i8. if (AA.getTargetData() == 0) return AliasAnalysis::Location(); - + // FIXME: We don't know the size of the trampoline, so we can't really // handle it here. return AliasAnalysis::Location(II->getArgOperand(0)); @@ -198,10 +200,10 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { /// getLocForRead - Return the location read by the specified "hasMemoryWrite" /// instruction if any. -static AliasAnalysis::Location +static AliasAnalysis::Location getLocForRead(Instruction *Inst, AliasAnalysis &AA) { assert(hasMemoryWrite(Inst) && "Unknown instruction case"); - + // The only instructions that both read and write are the mem transfer // instructions (memcpy/memmove). if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst)) @@ -213,10 +215,10 @@ getLocForRead(Instruction *Inst, AliasAnalysis &AA) { /// isRemovable - If the value of this instruction and the memory it writes to /// is unused, may we delete this instruction? static bool isRemovable(Instruction *I) { - // Don't remove volatile stores. + // Don't remove volatile/atomic stores. 
if (StoreInst *SI = dyn_cast<StoreInst>(I)) - return !SI->isVolatile(); - + return SI->isUnordered(); + IntrinsicInst *II = cast<IntrinsicInst>(I); switch (II->getIntrinsicID()) { default: assert(0 && "doesn't pass 'hasMemoryWrite' predicate"); @@ -227,7 +229,7 @@ static bool isRemovable(Instruction *I) { case Intrinsic::init_trampoline: // Always safe to remove init_trampoline. return true; - + case Intrinsic::memset: case Intrinsic::memmove: case Intrinsic::memcpy: @@ -255,14 +257,14 @@ static uint64_t getPointerSize(Value *V, AliasAnalysis &AA) { const TargetData *TD = AA.getTargetData(); if (TD == 0) return AliasAnalysis::UnknownSize; - + if (AllocaInst *A = dyn_cast<AllocaInst>(V)) { // Get size information for the alloca if (ConstantInt *C = dyn_cast<ConstantInt>(A->getArraySize())) return C->getZExtValue() * TD->getTypeAllocSize(A->getAllocatedType()); return AliasAnalysis::UnknownSize; } - + assert(isa<Argument>(V) && "Expected AllocaInst or Argument!"); PointerType *PT = cast<PointerType>(V->getType()); return TD->getTypeAllocSize(PT->getElementType()); @@ -287,7 +289,7 @@ static bool isCompleteOverwrite(const AliasAnalysis::Location &Later, AliasAnalysis &AA) { const Value *P1 = Earlier.Ptr->stripPointerCasts(); const Value *P2 = Later.Ptr->stripPointerCasts(); - + // If the start pointers are the same, we just have to compare sizes to see if // the later store was larger than the earlier store. if (P1 == P2) { @@ -302,33 +304,33 @@ static bool isCompleteOverwrite(const AliasAnalysis::Location &Later, return Later.Ptr->getType() == Earlier.Ptr->getType(); return false; } - + // Make sure that the Later size is >= the Earlier size. if (Later.Size < Earlier.Size) return false; return true; } - + // Otherwise, we have to have size information, and the later store has to be // larger than the earlier one. if (Later.Size == AliasAnalysis::UnknownSize || Earlier.Size == AliasAnalysis::UnknownSize || Later.Size <= Earlier.Size || AA.getTargetData() == 0) return false; - + // Check to see if the later store is to the entire object (either a global, // an alloca, or a byval argument). If so, then it clearly overwrites any // other store to the same object. const TargetData &TD = *AA.getTargetData(); - + const Value *UO1 = GetUnderlyingObject(P1, &TD), *UO2 = GetUnderlyingObject(P2, &TD); - + // If we can't resolve the same pointers to the same object, then we can't // analyze them at all. if (UO1 != UO2) return false; - + // If the "Later" store is to a recognizable object, get its size. if (isObjectPointerWithTrustworthySize(UO2)) { uint64_t ObjectSize = @@ -336,26 +338,26 @@ static bool isCompleteOverwrite(const AliasAnalysis::Location &Later, if (ObjectSize == Later.Size) return true; } - + // Okay, we have stores to two completely different pointers. Try to // decompose the pointer into a "base + constant_offset" form. If the base // pointers are equal, then we can reason about the two stores. int64_t EarlierOff = 0, LaterOff = 0; const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, TD); const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, TD); - + // If the base pointers still differ, we have two completely different stores. if (BP1 != BP2) return false; // The later store completely overlaps the earlier store if: - // + // // 1. Both start at the same offset and the later one's size is greater than // or equal to the earlier one's, or // // |--earlier--| // |-- later --| - // + // // 2. 
The earlier store has an offset greater than the later offset, but which // still lies completely within the later store. // @@ -373,7 +375,7 @@ static bool isCompleteOverwrite(const AliasAnalysis::Location &Later, /// isPossibleSelfRead - If 'Inst' might be a self read (i.e. a noop copy of a /// memory region into an identical pointer) then it doesn't actually make its -/// input dead in the traditional sense. Consider this case: +/// input dead in the traditional sense. Consider this case: /// /// memcpy(A <- B) /// memcpy(A <- A) @@ -391,10 +393,10 @@ static bool isPossibleSelfRead(Instruction *Inst, // location read. AliasAnalysis::Location InstReadLoc = getLocForRead(Inst, AA); if (InstReadLoc.Ptr == 0) return false; // Not a reading instruction. - + // If the read and written loc obviously don't alias, it isn't a read. if (AA.isNoAlias(InstReadLoc, InstStoreLoc)) return false; - + // Okay, 'Inst' may copy over itself. However, we can still remove a the // DepWrite instruction if we can prove that it reads from the same location // as Inst. This handles useful cases like: @@ -404,10 +406,10 @@ static bool isPossibleSelfRead(Instruction *Inst, // aliases, so removing the first memcpy is safe (assuming it writes <= # // bytes as the second one. AliasAnalysis::Location DepReadLoc = getLocForRead(DepWrite, AA); - + if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr)) return false; - + // If DepWrite doesn't read memory or if we can't prove it is a must alias, // then it can't be considered dead. return true; @@ -420,43 +422,43 @@ static bool isPossibleSelfRead(Instruction *Inst, bool DSE::runOnBasicBlock(BasicBlock &BB) { bool MadeChange = false; - + // Do a top-down walk on the BB. for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) { Instruction *Inst = BBI++; - + // Handle 'free' calls specially. if (CallInst *F = isFreeCall(Inst)) { MadeChange |= HandleFree(F); continue; } - + // If we find something that writes memory, get its memory dependence. if (!hasMemoryWrite(Inst)) continue; MemDepResult InstDep = MD->getDependency(Inst); - + // Ignore any store where we can't find a local dependence. // FIXME: cross-block DSE would be fun. :) - if (InstDep.isNonLocal() || InstDep.isUnknown()) + if (!InstDep.isDef() && !InstDep.isClobber()) continue; - + // If we're storing the same value back to a pointer that we just // loaded from, then the store can be removed. if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { if (LoadInst *DepLoad = dyn_cast<LoadInst>(InstDep.getInst())) { if (SI->getPointerOperand() == DepLoad->getPointerOperand() && - SI->getOperand(0) == DepLoad && !SI->isVolatile()) { + SI->getOperand(0) == DepLoad && isRemovable(SI)) { DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n " << "LOAD: " << *DepLoad << "\n STORE: " << *SI << '\n'); - + // DeleteDeadInstruction can delete the current instruction. Save BBI // in case we need it. WeakVH NextInst(BBI); - + DeleteDeadInstruction(SI, *MD); - + if (NextInst == 0) // Next instruction deleted. BBI = BB.begin(); else if (BBI != BB.begin()) // Revisit this instruction if possible. @@ -467,15 +469,15 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { } } } - + // Figure out what location is being stored to. AliasAnalysis::Location Loc = getLocForWrite(Inst, *AA); // If we didn't get a useful location, fail. 
if (Loc.Ptr == 0) continue; - - while (!InstDep.isNonLocal() && !InstDep.isUnknown()) { + + while (InstDep.isDef() || InstDep.isClobber()) { // Get the memory clobbered by the instruction we depend on. MemDep will // skip any instructions that 'Loc' clearly doesn't interact with. If we // end up depending on a may- or must-aliased load, then we can't optimize @@ -496,12 +498,12 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { !isPossibleSelfRead(Inst, Loc, DepWrite, *AA)) { DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite << "\n KILLER: " << *Inst << '\n'); - + // Delete the store and now-dead instructions that feed it. DeleteDeadInstruction(DepWrite, *MD); ++NumFastStores; MadeChange = true; - + // DeleteDeadInstruction can delete the current instruction in loop // cases, reset BBI. BBI = Inst; @@ -509,7 +511,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { --BBI; break; } - + // If this is a may-aliased store that is clobbering the store value, we // can keep searching past it for another must-aliased pointer that stores // to the same location. For example, in: @@ -519,20 +521,20 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { // we can remove the first store to P even though we don't know if P and Q // alias. if (DepWrite == &BB.front()) break; - + // Can't look past this instruction if it might read 'Loc'. if (AA->getModRefInfo(DepWrite, Loc) & AliasAnalysis::Ref) break; - + InstDep = MD->getPointerDependencyFrom(Loc, false, DepWrite, &BB); } } - + // If this block ends in a return, unwind, or unreachable, all allocas are // dead at its end, which means stores to them are also dead. if (BB.getTerminator()->getNumSuccessors() == 0) MadeChange |= handleEndBlock(BB); - + return MadeChange; } @@ -543,18 +545,18 @@ bool DSE::HandleFree(CallInst *F) { MemDepResult Dep = MD->getDependency(F); - while (!Dep.isNonLocal() && !Dep.isUnknown()) { + while (Dep.isDef() || Dep.isClobber()) { Instruction *Dependency = Dep.getInst(); if (!hasMemoryWrite(Dependency) || !isRemovable(Dependency)) return MadeChange; - + Value *DepPointer = GetUnderlyingObject(getStoredPointerOperand(Dependency)); // Check for aliasing. if (!AA->isMustAlias(F->getArgOperand(0), DepPointer)) return MadeChange; - + // DCE instructions only used to calculate that store DeleteDeadInstruction(Dependency, *MD); ++NumFastStores; @@ -567,7 +569,7 @@ bool DSE::HandleFree(CallInst *F) { // free(s); Dep = MD->getDependency(F); }; - + return MadeChange; } @@ -579,28 +581,28 @@ bool DSE::HandleFree(CallInst *F) { /// ret void bool DSE::handleEndBlock(BasicBlock &BB) { bool MadeChange = false; - + // Keep track of all of the stack objects that are dead at the end of the // function. SmallPtrSet<Value*, 16> DeadStackObjects; - + // Find all of the alloca'd pointers in the entry block. BasicBlock *Entry = BB.getParent()->begin(); for (BasicBlock::iterator I = Entry->begin(), E = Entry->end(); I != E; ++I) if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) DeadStackObjects.insert(AI); - + // Treat byval arguments the same, stores to them are dead at the end of the // function. for (Function::arg_iterator AI = BB.getParent()->arg_begin(), AE = BB.getParent()->arg_end(); AI != AE; ++AI) if (AI->hasByValAttr()) DeadStackObjects.insert(AI); - + // Scan the basic block backwards for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ){ --BBI; - + // If we find a store, check to see if it points into a dead stack value. 
if (hasMemoryWrite(BBI) && isRemovable(BBI)) { // See through pointer-to-pointer bitcasts @@ -609,10 +611,10 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // Stores to stack values are valid candidates for removal. if (DeadStackObjects.count(Pointer)) { Instruction *Dead = BBI++; - + DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: " << *Dead << "\n Object: " << *Pointer << '\n'); - + // DCE instructions only used to calculate that store. DeleteDeadInstruction(Dead, *MD, &DeadStackObjects); ++NumFastStores; @@ -620,7 +622,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { continue; } } - + // Remove any dead non-memory-mutating instructions. if (isInstructionTriviallyDead(BBI)) { Instruction *Inst = BBI++; @@ -629,55 +631,61 @@ bool DSE::handleEndBlock(BasicBlock &BB) { MadeChange = true; continue; } - + if (AllocaInst *A = dyn_cast<AllocaInst>(BBI)) { DeadStackObjects.erase(A); continue; } - + if (CallSite CS = cast<Value>(BBI)) { // If this call does not access memory, it can't be loading any of our // pointers. if (AA->doesNotAccessMemory(CS)) continue; - + // If the call might load from any of our allocas, then any store above // the call is live. SmallVector<Value*, 8> LiveAllocas; for (SmallPtrSet<Value*, 16>::iterator I = DeadStackObjects.begin(), E = DeadStackObjects.end(); I != E; ++I) { // See if the call site touches it. - AliasAnalysis::ModRefResult A = + AliasAnalysis::ModRefResult A = AA->getModRefInfo(CS, *I, getPointerSize(*I, *AA)); - + if (A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref) LiveAllocas.push_back(*I); } - + for (SmallVector<Value*, 8>::iterator I = LiveAllocas.begin(), E = LiveAllocas.end(); I != E; ++I) DeadStackObjects.erase(*I); - + // If all of the allocas were clobbered by the call then we're not going // to find anything else to process. if (DeadStackObjects.empty()) return MadeChange; - + continue; } - + AliasAnalysis::Location LoadedLoc; - + // If we encounter a use of the pointer, it is no longer considered dead if (LoadInst *L = dyn_cast<LoadInst>(BBI)) { + if (!L->isUnordered()) // Be conservative with atomic/volatile load + break; LoadedLoc = AA->getLocation(L); } else if (VAArgInst *V = dyn_cast<VAArgInst>(BBI)) { LoadedLoc = AA->getLocation(V); } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(BBI)) { LoadedLoc = AA->getLocationForSource(MTI); - } else { - // Not a loading instruction. + } else if (!BBI->mayReadFromMemory()) { + // Instruction doesn't read memory. Note that stores that weren't removed + // above will hit this case. continue; + } else { + // Unknown inst; assume it clobbers everything. + break; } // Remove any allocas from the DeadPointer set that are loaded, as this @@ -689,7 +697,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { if (DeadStackObjects.empty()) break; } - + return MadeChange; } @@ -703,14 +711,14 @@ void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, // A constant can't be in the dead pointer set. if (isa<Constant>(UnderlyingPointer)) return; - + // If the kill pointer can be easily reduced to an alloca, don't bother doing // extraneous AA queries. 
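The DeadStoreElimination hunks above tighten how atomic and volatile operations are treated: isRemovable now requires SI->isUnordered() rather than merely !SI->isVolatile(), the store-of-own-load case defers to isRemovable, and the backwards scan in handleEndBlock stops at non-unordered loads and at any unrecognized instruction that may read memory. A minimal sketch of the store-side predicate, assuming the intrinsic handling stays as in the patch (the helper name is illustrative):

  #include "llvm/Instructions.h"
  using namespace llvm;

  // Only simple/unordered stores are candidates for deletion; volatile and
  // atomic-ordered stores are always kept. The real isRemovable() additionally
  // whitelists lifetime, init.trampoline and mem* intrinsics.
  static bool isDeletableStore(Instruction *I) {
    if (StoreInst *SI = dyn_cast<StoreInst>(I))
      return SI->isUnordered();
    return false;
  }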
if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) { DeadStackObjects.erase(const_cast<Value*>(UnderlyingPointer)); return; } - + SmallVector<Value*, 16> NowLive; for (SmallPtrSet<Value*, 16>::iterator I = DeadStackObjects.begin(), E = DeadStackObjects.end(); I != E; ++I) { diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index 3d3f17b..c0223d2 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -92,7 +92,7 @@ unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { // Hash in all of the operands as pointers. unsigned Res = 0; for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) - Res ^= getHash(Inst->getOperand(i)) << i; + Res ^= getHash(Inst->getOperand(i)) << (i & 0xF); if (CastInst *CI = dyn_cast<CastInst>(Inst)) Res ^= getHash(CI->getType()); @@ -185,7 +185,7 @@ unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) { for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) { assert(!Inst->getOperand(i)->getType()->isMetadataTy() && "Cannot value number calls with metadata operands"); - Res ^= getHash(Inst->getOperand(i)) << i; + Res ^= getHash(Inst->getOperand(i)) << (i & 0xF); } // Mix in the opcode. @@ -357,7 +357,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // If this is a non-volatile load, process it. if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { // Ignore volatile loads. - if (LI->isVolatile()) { + if (!LI->isSimple()) { LastStore = 0; continue; } @@ -437,7 +437,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { std::pair<Value*, unsigned>(SI->getValueOperand(), CurrentGeneration)); // Remember that this was the last store we saw for DSE. - if (!SI->isVolatile()) + if (SI->isSimple()) LastStore = SI; } } diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index b4d5667..a51cbb6 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -41,12 +41,16 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/IRBuilder.h" +#include "llvm/Support/PatternMatch.h" using namespace llvm; +using namespace PatternMatch; STATISTIC(NumGVNInstr, "Number of instructions deleted"); STATISTIC(NumGVNLoad, "Number of loads deleted"); STATISTIC(NumGVNPRE, "Number of instructions PRE'd"); STATISTIC(NumGVNBlocks, "Number of blocks merged"); +STATISTIC(NumGVNSimpl, "Number of instructions simplified"); +STATISTIC(NumGVNEqProp, "Number of equalities propagated"); STATISTIC(NumPRELoad, "Number of loads PRE'd"); static cl::opt<bool> EnablePRE("enable-pre", @@ -548,6 +552,9 @@ namespace { void cleanupGlobalSets(); void verifyRemoved(const Instruction *I) const; bool splitCriticalEdges(); + unsigned replaceAllDominatedUsesWith(Value *From, Value *To, + BasicBlock *Root); + bool propagateEquality(Value *LHS, Value *RHS, BasicBlock *Root); }; char GVN::ID = 0; @@ -689,8 +696,8 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, // If this is already the right type, just return it. Type *StoredValTy = StoredVal->getType(); - uint64_t StoreSize = TD.getTypeStoreSizeInBits(StoredValTy); - uint64_t LoadSize = TD.getTypeStoreSizeInBits(LoadedTy); + uint64_t StoreSize = TD.getTypeSizeInBits(StoredValTy); + uint64_t LoadSize = TD.getTypeSizeInBits(LoadedTy); // If the store and reload are the same size, we can always reuse it. 
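On the EarlyCSE hashing change above: the old combiner shifted each operand hash by the operand index, so a call with 32 or more arguments shifted by at least the width of unsigned (undefined behaviour), and large indices discarded most of that operand's hash bits anyway. Masking the shift with (i & 0xF) keeps it in the range 0..15. A standalone sketch of the corrected combiner (the function name and the flat array input are illustrative):

  // Mixes per-operand hashes while keeping every shift amount in [0, 15].
  static unsigned combineOperandHashes(const unsigned *Hashes, unsigned NumOps) {
    unsigned Res = 0;
    for (unsigned i = 0; i != NumOps; ++i)
      Res ^= Hashes[i] << (i & 0xF);
    return Res;
  }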
if (StoreSize == LoadSize) { @@ -920,7 +927,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, llvm::Type::getInt8PtrTy(Src->getContext())); Constant *OffsetCst = ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); - Src = ConstantExpr::getGetElementPtr(Src, &OffsetCst, 1); + Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy)); if (ConstantFoldLoadFromConstPtr(Src, &TD)) return Offset; @@ -946,10 +953,9 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, // Compute which bits of the stored value are being used by the load. Convert // to an integer type to start with. if (SrcVal->getType()->isPointerTy()) - SrcVal = Builder.CreatePtrToInt(SrcVal, TD.getIntPtrType(Ctx), "tmp"); + SrcVal = Builder.CreatePtrToInt(SrcVal, TD.getIntPtrType(Ctx)); if (!SrcVal->getType()->isIntegerTy()) - SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8), - "tmp"); + SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8)); // Shift the bits to the least significant depending on endianness. unsigned ShiftAmt; @@ -959,11 +965,10 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, ShiftAmt = (StoreSize-LoadSize-Offset)*8; if (ShiftAmt) - SrcVal = Builder.CreateLShr(SrcVal, ShiftAmt, "tmp"); + SrcVal = Builder.CreateLShr(SrcVal, ShiftAmt); if (LoadSize != StoreSize) - SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8), - "tmp"); + SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8)); return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, TD); } @@ -982,8 +987,8 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, unsigned SrcValSize = TD.getTypeStoreSize(SrcVal->getType()); unsigned LoadSize = TD.getTypeStoreSize(LoadTy); if (Offset+LoadSize > SrcValSize) { - assert(!SrcVal->isVolatile() && "Cannot widen volatile load!"); - assert(isa<IntegerType>(SrcVal->getType())&&"Can't widen non-integer load"); + assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!"); + assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load"); // If we have a load/load clobber an DepLI can be widened to cover this // load, then we should widen it to the next power of 2 size big enough! unsigned NewLoadSize = Offset+LoadSize; @@ -1081,7 +1086,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, llvm::Type::getInt8PtrTy(Src->getContext())); Constant *OffsetCst = ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); - Src = ConstantExpr::getGetElementPtr(Src, &OffsetCst, 1); + Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy)); return ConstantFoldLoadFromConstPtr(Src, &TD); } @@ -1274,7 +1279,9 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { // If we had a phi translation failure, we'll have a single entry which is a // clobber in the current block. Reject this early. 
- if (Deps.size() == 1 && Deps[0].getResult().isUnknown()) { + if (Deps.size() == 1 + && !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) + { DEBUG( dbgs() << "GVN: non-local load "; WriteAsOperand(dbgs(), LI); @@ -1294,7 +1301,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { BasicBlock *DepBB = Deps[i].getBB(); MemDepResult DepInfo = Deps[i].getResult(); - if (DepInfo.isUnknown()) { + if (!DepInfo.isDef() && !DepInfo.isClobber()) { UnavailableBlocks.push_back(DepBB); continue; } @@ -1359,7 +1366,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { continue; } - assert(DepInfo.isDef() && "Expecting def here"); + // DepInfo.isDef() here Instruction *DepInst = DepInfo.getInst(); @@ -1446,8 +1453,8 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i) Blockers.insert(UnavailableBlocks[i]); - // Lets find first basic block with more than one predecessor. Walk backwards - // through predecessors if needed. + // Let's find the first basic block with more than one predecessor. Walk + // backwards through predecessors if needed. BasicBlock *LoadBB = LI->getParent(); BasicBlock *TmpBB = LoadBB; @@ -1519,10 +1526,19 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { << Pred->getName() << "': " << *LI << '\n'); return false; } + + if (LoadBB->isLandingPad()) { + DEBUG(dbgs() + << "COULD NOT PRE LOAD BECAUSE OF LANDING PAD CRITICAL EDGE '" + << Pred->getName() << "': " << *LI << '\n'); + return false; + } + unsigned SuccNum = GetSuccessorNumber(Pred, LoadBB); NeedToSplit.push_back(std::make_pair(Pred->getTerminator(), SuccNum)); } } + if (!NeedToSplit.empty()) { toSplit.append(NeedToSplit.begin(), NeedToSplit.end()); return false; @@ -1660,7 +1676,7 @@ bool GVN::processLoad(LoadInst *L) { if (!MD) return false; - if (L->isVolatile()) + if (!L->isSimple()) return false; if (L->use_empty()) { @@ -1747,7 +1763,11 @@ bool GVN::processLoad(LoadInst *L) { return false; } - if (Dep.isUnknown()) { + // If it is defined in another block, try harder. + if (Dep.isNonLocal()) + return processNonLocalLoad(L); + + if (!Dep.isDef()) { DEBUG( // fast print dep, using operator<< on instruction is too slow. dbgs() << "GVN: load "; @@ -1757,12 +1777,6 @@ bool GVN::processLoad(LoadInst *L) { return false; } - // If it is defined in another block, try harder. - if (Dep.isNonLocal()) - return processNonLocalLoad(L); - - assert(Dep.isDef() && "Expecting def here"); - Instruction *DepInst = Dep.getInst(); if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) { Value *StoredVal = DepSI->getValueOperand(); @@ -1874,6 +1888,138 @@ Value *GVN::findLeader(BasicBlock *BB, uint32_t num) { return Val; } +/// replaceAllDominatedUsesWith - Replace all uses of 'From' with 'To' if the +/// use is dominated by the given basic block. Returns the number of uses that +/// were replaced. +unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To, + BasicBlock *Root) { + unsigned Count = 0; + for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); + UI != UE; ) { + Instruction *User = cast<Instruction>(*UI); + unsigned OpNum = UI.getOperandNo(); + ++UI; + + if (DT->dominates(Root, User->getParent())) { + User->setOperand(OpNum, To); + ++Count; + } + } + return Count; +} + +/// propagateEquality - The given values are known to be equal in every block +/// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with +/// 'RHS' everywhere in the scope. Returns whether a change was made. 
+bool GVN::propagateEquality(Value *LHS, Value *RHS, BasicBlock *Root) { + if (LHS == RHS) return false; + assert(LHS->getType() == RHS->getType() && "Equal but types differ!"); + + // Don't try to propagate equalities between constants. + if (isa<Constant>(LHS) && isa<Constant>(RHS)) + return false; + + // Make sure that any constants are on the right-hand side. In general the + // best results are obtained by placing the longest lived value on the RHS. + if (isa<Constant>(LHS)) + std::swap(LHS, RHS); + + // If neither term is constant then bail out. This is not for correctness, + // it's just that the non-constant case is much less useful: it occurs just + // as often as the constant case but handling it hardly ever results in an + // improvement. + if (!isa<Constant>(RHS)) + return false; + + // If value numbering later deduces that an instruction in the scope is equal + // to 'LHS' then ensure it will be turned into 'RHS'. + addToLeaderTable(VN.lookup_or_add(LHS), RHS, Root); + + // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. As + // LHS always has at least one use that is not dominated by Root, this will + // never do anything if LHS has only one use. + bool Changed = false; + if (!LHS->hasOneUse()) { + unsigned NumReplacements = replaceAllDominatedUsesWith(LHS, RHS, Root); + Changed |= NumReplacements > 0; + NumGVNEqProp += NumReplacements; + } + + // Now try to deduce additional equalities from this one. For example, if the + // known equality was "(A != B)" == "false" then it follows that A and B are + // equal in the scope. Only boolean equalities with an explicit true or false + // RHS are currently supported. + if (!RHS->getType()->isIntegerTy(1)) + // Not a boolean equality - bail out. + return Changed; + ConstantInt *CI = dyn_cast<ConstantInt>(RHS); + if (!CI) + // RHS neither 'true' nor 'false' - bail out. + return Changed; + // Whether RHS equals 'true'. Otherwise it equals 'false'. + bool isKnownTrue = CI->isAllOnesValue(); + bool isKnownFalse = !isKnownTrue; + + // If "A && B" is known true then both A and B are known true. If "A || B" + // is known false then both A and B are known false. + Value *A, *B; + if ((isKnownTrue && match(LHS, m_And(m_Value(A), m_Value(B)))) || + (isKnownFalse && match(LHS, m_Or(m_Value(A), m_Value(B))))) { + Changed |= propagateEquality(A, RHS, Root); + Changed |= propagateEquality(B, RHS, Root); + return Changed; + } + + // If we are propagating an equality like "(A == B)" == "true" then also + // propagate the equality A == B. + if (ICmpInst *Cmp = dyn_cast<ICmpInst>(LHS)) { + // Only equality comparisons are supported. + if ((isKnownTrue && Cmp->getPredicate() == CmpInst::ICMP_EQ) || + (isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE)) { + Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1); + Changed |= propagateEquality(Op0, Op1, Root); + } + return Changed; + } + + return Changed; +} + +/// isOnlyReachableViaThisEdge - There is an edge from 'Src' to 'Dst'. Return +/// true if every path from the entry block to 'Dst' passes via this edge. In +/// particular 'Dst' must not be reachable via another edge from 'Src'. +static bool isOnlyReachableViaThisEdge(BasicBlock *Src, BasicBlock *Dst, + DominatorTree *DT) { + // First off, there must not be more than one edge from Src to Dst, there + // should be exactly one. So keep track of the number of times Src occurs + // as a predecessor of Dst and fail if it's more than once. 
Secondly, any + // other predecessors of Dst should be dominated by Dst (see logic below). + bool SawEdgeFromSrc = false; + for (pred_iterator PI = pred_begin(Dst), PE = pred_end(Dst); PI != PE; ++PI) { + BasicBlock *Pred = *PI; + if (Pred == Src) { + // An edge from Src to Dst. + if (SawEdgeFromSrc) + // There are multiple edges from Src to Dst - fail. + return false; + SawEdgeFromSrc = true; + continue; + } + // If the predecessor is not dominated by Dst, then it must be possible to + // reach it either without passing through Src (and thus not via the edge) + // or by passing through Src but taking a different edge out of Src. Either + // way it is possible to reach Dst without passing via the edge, so fail. + if (!DT->dominates(Dst, *PI)) + return false; + } + assert(SawEdgeFromSrc && "No edge between these basic blocks!"); + + // Every path from the entry block to Dst must at some point pass to Dst from + // a predecessor that is not dominated by Dst. This predecessor can only be + // Src, since all others are dominated by Dst. As there is only one edge from + // Src to Dst, the path passes by this edge. + return true; +} /// processInstruction - When calculating availability, handle an instruction /// by inserting it into the appropriate sets @@ -1891,6 +2037,7 @@ bool GVN::processInstruction(Instruction *I) { if (MD && V->getType()->isPointerTy()) MD->invalidateCachedPointerInfo(V); markInstructionForDeletion(I); + ++NumGVNSimpl; return true; } @@ -1903,30 +2050,45 @@ bool GVN::processInstruction(Instruction *I) { return false; } - // For conditions branches, we can perform simple conditional propagation on + // For conditional branches, we can perform simple conditional propagation on // the condition value itself. if (BranchInst *BI = dyn_cast<BranchInst>(I)) { if (!BI->isConditional() || isa<Constant>(BI->getCondition())) return false; - + Value *BranchCond = BI->getCondition(); - uint32_t CondVN = VN.lookup_or_add(BranchCond); - + BasicBlock *TrueSucc = BI->getSuccessor(0); BasicBlock *FalseSucc = BI->getSuccessor(1); - - if (TrueSucc->getSinglePredecessor()) - addToLeaderTable(CondVN, - ConstantInt::getTrue(TrueSucc->getContext()), - TrueSucc); - if (FalseSucc->getSinglePredecessor()) - addToLeaderTable(CondVN, - ConstantInt::getFalse(TrueSucc->getContext()), - FalseSucc); - - return false; + BasicBlock *Parent = BI->getParent(); + bool Changed = false; + + if (isOnlyReachableViaThisEdge(Parent, TrueSucc, DT)) + Changed |= propagateEquality(BranchCond, + ConstantInt::getTrue(TrueSucc->getContext()), + TrueSucc); + + if (isOnlyReachableViaThisEdge(Parent, FalseSucc, DT)) + Changed |= propagateEquality(BranchCond, + ConstantInt::getFalse(FalseSucc->getContext()), + FalseSucc); + + return Changed; } - + + // For switches, propagate the case values into the case destinations. + if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) { + Value *SwitchCond = SI->getCondition(); + BasicBlock *Parent = SI->getParent(); + bool Changed = false; + for (unsigned i = 1, e = SI->getNumCases(); i != e; ++i) { + BasicBlock *Dst = SI->getSuccessor(i); + if (isOnlyReachableViaThisEdge(Parent, Dst, DT)) + Changed |= propagateEquality(SwitchCond, SI->getCaseValue(i), Dst); + } + return Changed; + } + // Instructions with void type don't return a value, so there's // no point in trying to find redudancies in them. if (I->getType()->isVoidTy()) return false; @@ -2071,6 +2233,9 @@ bool GVN::performPRE(Function &F) { // Nothing to PRE in the entry block. 
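The GVN changes above generalize the old single-predecessor trick into real equality propagation: when an edge is the only way to reach its destination (isOnlyReachableViaThisEdge), the branch or switch condition is a known constant there, and propagateEquality rewrites every use of the condition that is dominated by that block. The rewriting itself is the replaceAllDominatedUsesWith helper shown above; the sketch below restates it with comments on the iterator handling, which is the subtle part (the free-function form and its name are illustrative):

  #include "llvm/Instructions.h"
  #include "llvm/Analysis/Dominators.h"
  using namespace llvm;

  static unsigned rewriteDominatedUses(Value *From, Value *To,
                                       BasicBlock *Root, DominatorTree *DT) {
    unsigned Count = 0;
    for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
         UI != UE; ) {
      Instruction *User = cast<Instruction>(*UI);
      // Record the operand number and advance before mutating: setOperand()
      // unlinks this use from From's use list, which would invalidate UI.
      unsigned OpNo = UI.getOperandNo();
      ++UI;
      if (DT->dominates(Root, User->getParent())) {
        User->setOperand(OpNo, To);
        ++Count;
      }
    }
    return Count;
  }

Since propagateEquality first moves any constant to the right-hand side, the common effect is folding the branch condition to true or false throughout the dominated region.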
if (CurrentBlock == &F.getEntryBlock()) continue; + // Don't perform PRE on a landing pad. + if (CurrentBlock->isLandingPad()) continue; + for (BasicBlock::iterator BI = CurrentBlock->begin(), BE = CurrentBlock->end(); BI != BE; ) { Instruction *CurInst = BI++; diff --git a/lib/Transforms/Scalar/GlobalMerge.cpp b/lib/Transforms/Scalar/GlobalMerge.cpp new file mode 100644 index 0000000..0772b48 --- /dev/null +++ b/lib/Transforms/Scalar/GlobalMerge.cpp @@ -0,0 +1,226 @@ +//===-- GlobalMerge.cpp - Internal globals merging -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This pass merges globals with internal linkage into one. This way all the +// globals which were merged into a biggest one can be addressed using offsets +// from the same base pointer (no need for separate base pointer for each of the +// global). Such a transformation can significantly reduce the register pressure +// when many globals are involved. +// +// For example, consider the code which touches several global variables at +// once: +// +// static int foo[N], bar[N], baz[N]; +// +// for (i = 0; i < N; ++i) { +// foo[i] = bar[i] * baz[i]; +// } +// +// On ARM the addresses of 3 arrays should be kept in the registers, thus +// this code has quite large register pressure (loop body): +// +// ldr r1, [r5], #4 +// ldr r2, [r6], #4 +// mul r1, r2, r1 +// str r1, [r0], #4 +// +// Pass converts the code to something like: +// +// static struct { +// int foo[N]; +// int bar[N]; +// int baz[N]; +// } merged; +// +// for (i = 0; i < N; ++i) { +// merged.foo[i] = merged.bar[i] * merged.baz[i]; +// } +// +// and in ARM code this becomes: +// +// ldr r0, [r5, #40] +// ldr r1, [r5, #80] +// mul r0, r1, r0 +// str r0, [r5], #4 +// +// note that we saved 2 registers here almostly "for free". +// ===---------------------------------------------------------------------===// + +#define DEBUG_TYPE "global-merge" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Attributes.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/GlobalVariable.h" +#include "llvm/Instructions.h" +#include "llvm/Intrinsics.h" +#include "llvm/Module.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumMerged , "Number of globals merged"); +namespace { + class GlobalMerge : public FunctionPass { + /// TLI - Keep a pointer of a TargetLowering to consult for determining + /// target type sizes. + const TargetLowering *TLI; + + bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals, + Module &M, bool isConst) const; + + public: + static char ID; // Pass identification, replacement for typeid. 
+ explicit GlobalMerge(const TargetLowering *tli = 0) + : FunctionPass(ID), TLI(tli) { + initializeGlobalMergePass(*PassRegistry::getPassRegistry()); + } + + virtual bool doInitialization(Module &M); + virtual bool runOnFunction(Function &F); + + const char *getPassName() const { + return "Merge internal globals"; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + FunctionPass::getAnalysisUsage(AU); + } + + struct GlobalCmp { + const TargetData *TD; + + GlobalCmp(const TargetData *td) : TD(td) { } + + bool operator()(const GlobalVariable *GV1, const GlobalVariable *GV2) { + Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType(); + Type *Ty2 = cast<PointerType>(GV2->getType())->getElementType(); + + return (TD->getTypeAllocSize(Ty1) < TD->getTypeAllocSize(Ty2)); + } + }; + }; +} // end anonymous namespace + +char GlobalMerge::ID = 0; +INITIALIZE_PASS(GlobalMerge, "global-merge", + "Global Merge", false, false) + + +bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, + Module &M, bool isConst) const { + const TargetData *TD = TLI->getTargetData(); + + // FIXME: Infer the maximum possible offset depending on the actual users + // (these max offsets are different for the users inside Thumb or ARM + // functions) + unsigned MaxOffset = TLI->getMaximalGlobalOffset(); + + // FIXME: Find better heuristics + std::stable_sort(Globals.begin(), Globals.end(), GlobalCmp(TD)); + + Type *Int32Ty = Type::getInt32Ty(M.getContext()); + + for (size_t i = 0, e = Globals.size(); i != e; ) { + size_t j = 0; + uint64_t MergedSize = 0; + std::vector<Type*> Tys; + std::vector<Constant*> Inits; + for (j = i; j != e; ++j) { + Type *Ty = Globals[j]->getType()->getElementType(); + MergedSize += TD->getTypeAllocSize(Ty); + if (MergedSize > MaxOffset) { + break; + } + Tys.push_back(Ty); + Inits.push_back(Globals[j]->getInitializer()); + } + + StructType *MergedTy = StructType::get(M.getContext(), Tys); + Constant *MergedInit = ConstantStruct::get(MergedTy, Inits); + GlobalVariable *MergedGV = new GlobalVariable(M, MergedTy, isConst, + GlobalValue::InternalLinkage, + MergedInit, "_MergedGlobals"); + for (size_t k = i; k < j; ++k) { + Constant *Idx[2] = { + ConstantInt::get(Int32Ty, 0), + ConstantInt::get(Int32Ty, k-i) + }; + Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(MergedGV, Idx); + Globals[k]->replaceAllUsesWith(GEP); + Globals[k]->eraseFromParent(); + NumMerged++; + } + i = j; + } + + return true; +} + + +bool GlobalMerge::doInitialization(Module &M) { + SmallVector<GlobalVariable*, 16> Globals, ConstGlobals, BSSGlobals; + const TargetData *TD = TLI->getTargetData(); + unsigned MaxOffset = TLI->getMaximalGlobalOffset(); + bool Changed = false; + + // Grab all non-const globals. + for (Module::global_iterator I = M.global_begin(), + E = M.global_end(); I != E; ++I) { + // Merge is safe for "normal" internal globals only + if (!I->hasLocalLinkage() || I->isThreadLocal() || I->hasSection()) + continue; + + // Ignore fancy-aligned globals for now. + unsigned Alignment = I->getAlignment(); + Type *Ty = I->getType()->getElementType(); + if (Alignment > TD->getABITypeAlignment(Ty)) + continue; + + // Ignore all 'special' globals. 
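For the new GlobalMerge pass, the interesting policy sits in doMerge above: candidates are sorted by allocation size and then packed greedily into one struct after another, cutting a group as soon as its running size would pass TLI->getMaximalGlobalOffset(). The standalone sketch below isolates that grouping decision from the IR bookkeeping; the names, the plain size vector, and the always-take-the-first-element guard are illustrative (the pass itself pre-filters so every candidate is smaller than MaxOffset):

  #include <stddef.h>
  #include <stdint.h>
  #include <vector>

  // Sizes are assumed sorted ascending, as produced by std::stable_sort with
  // GlobalCmp in the pass. Each returned group holds indices into SortedSizes.
  static std::vector<std::vector<size_t> >
  groupBySize(const std::vector<uint64_t> &SortedSizes, uint64_t MaxOffset) {
    std::vector<std::vector<size_t> > Groups;
    for (size_t i = 0, e = SortedSizes.size(); i != e; ) {
      std::vector<size_t> Group;
      uint64_t Running = 0;
      size_t j = i;
      for (; j != e; ++j) {
        Running += SortedSizes[j];
        if (!Group.empty() && Running > MaxOffset)
          break;               // this global starts the next merged struct
        Group.push_back(j);
      }
      Groups.push_back(Group);
      i = j;                   // Group is never empty, so this always advances
    }
    return Groups;
  }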
+ if (I->getName().startswith("llvm.") || + I->getName().startswith(".llvm.")) + continue; + + if (TD->getTypeAllocSize(Ty) < MaxOffset) { + const TargetLoweringObjectFile &TLOF = TLI->getObjFileLowering(); + if (TLOF.getKindForGlobal(I, TLI->getTargetMachine()).isBSSLocal()) + BSSGlobals.push_back(I); + else if (I->isConstant()) + ConstGlobals.push_back(I); + else + Globals.push_back(I); + } + } + + if (Globals.size() > 1) + Changed |= doMerge(Globals, M, false); + if (BSSGlobals.size() > 1) + Changed |= doMerge(BSSGlobals, M, false); + + // FIXME: This currently breaks the EH processing due to way how the + // typeinfo detection works. We might want to detect the TIs and ignore + // them in the future. + // if (ConstGlobals.size() > 1) + // Changed |= doMerge(ConstGlobals, M, true); + + return Changed; +} + +bool GlobalMerge::runOnFunction(Function &F) { + return false; +} + +Pass *llvm::createGlobalMergePass(const TargetLowering *tli) { + return new GlobalMerge(tli); +} diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 50140d9..75fa011 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -11,17 +11,6 @@ // computations derived from them) into simpler forms suitable for subsequent // analysis and transformation. // -// This transformation makes the following changes to each loop with an -// identifiable induction variable: -// 1. All loops are transformed to have a SINGLE canonical induction variable -// which starts at zero and steps by one. -// 2. The canonical induction variable is guaranteed to be the first PHI node -// in the loop header block. -// 3. The canonical induction variable is guaranteed to be in a wide enough -// type so that IV expressions need not be (directly) zero-extended or -// sign-extended. -// 4. Any pointer arithmetic recurrences are raised to use array subscripts. -// // If the trip count of a loop is computable, this pass also makes the following // changes: // 1. The exit condition for the loop is canonicalized to compare the @@ -33,9 +22,6 @@ // purpose of the loop is to compute the exit value of some derived // expression, this transformation will make the loop dead. // -// This transformation should be followed by strength reduction after all of the -// desired loop transformations have been performed. 
-// //===----------------------------------------------------------------------===// #define DEBUG_TYPE "indvars" @@ -57,11 +43,11 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/SimplifyIndVar.h" #include "llvm/Target/TargetData.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" using namespace llvm; STATISTIC(NumRemoved , "Number of aux indvars removed"); @@ -69,21 +55,21 @@ STATISTIC(NumWidened , "Number of indvars widened"); STATISTIC(NumInserted , "Number of canonical indvars added"); STATISTIC(NumReplaced , "Number of exit values replaced"); STATISTIC(NumLFTR , "Number of loop exit tests replaced"); -STATISTIC(NumElimIdentity, "Number of IV identities eliminated"); STATISTIC(NumElimExt , "Number of IV sign/zero extends eliminated"); -STATISTIC(NumElimRem , "Number of IV remainder operations eliminated"); -STATISTIC(NumElimCmp , "Number of IV comparisons eliminated"); STATISTIC(NumElimIV , "Number of congruent IVs eliminated"); -static cl::opt<bool> DisableIVRewrite( - "disable-iv-rewrite", cl::Hidden, - cl::desc("Disable canonical induction variable rewriting")); - -// Temporary flag for use with -disable-iv-rewrite to force a canonical IV for -// LFTR purposes. -static cl::opt<bool> ForceLFTR( - "force-lftr", cl::Hidden, - cl::desc("Enable forced linear function test replacement")); +namespace llvm { + cl::opt<bool> EnableIVRewrite( + "enable-iv-rewrite", cl::Hidden, + cl::desc("Enable canonical induction variable rewriting")); + + // Trip count verification can be enabled by default under NDEBUG if we + // implement a strong expression equivalence checker in SCEV. Until then, we + // use the verify-indvars flag, which may assert in some cases. 
+ cl::opt<bool> VerifyIndvars( + "verify-indvars", cl::Hidden, + cl::desc("Verify the ScalarEvolution result after running indvars")); +} namespace { class IndVarSimplify : public LoopPass { @@ -111,12 +97,12 @@ namespace { AU.addRequired<ScalarEvolution>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); - if (!DisableIVRewrite) + if (EnableIVRewrite) AU.addRequired<IVUsers>(); AU.addPreserved<ScalarEvolution>(); AU.addPreservedID(LoopSimplifyID); AU.addPreservedID(LCSSAID); - if (!DisableIVRewrite) + if (EnableIVRewrite) AU.addPreserved<IVUsers>(); AU.setPreservesCFG(); } @@ -131,18 +117,9 @@ namespace { void HandleFloatingPointIV(Loop *L, PHINode *PH); void RewriteNonIntegerIVs(Loop *L); - void RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); - - void SimplifyIVUsers(SCEVExpander &Rewriter); - void SimplifyIVUsersNoRewrite(Loop *L, SCEVExpander &Rewriter); - - bool EliminateIVUser(Instruction *UseInst, Instruction *IVOperand); - void EliminateIVComparison(ICmpInst *ICmp, Value *IVOperand); - void EliminateIVRemainder(BinaryOperator *Rem, - Value *IVOperand, - bool IsSigned); + void SimplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LPPassManager &LPM); - void SimplifyCongruentIVs(Loop *L); + void RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); void RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter); @@ -240,8 +217,8 @@ static Instruction *getInsertPointForUses(Instruction *User, Value *Def, InsertPt = InsertBB->getTerminator(); } assert(InsertPt && "Missing phi operand"); - assert(!isa<Instruction>(Def) || - DT->dominates(cast<Instruction>(Def), InsertPt) && + assert((!isa<Instruction>(Def) || + DT->dominates(cast<Instruction>(Def), InsertPt)) && "def does not dominate all uses"); return InsertPt; } @@ -372,14 +349,14 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // Positive and negative strides have different safety conditions. if (IncValue > 0) { // If we have a positive stride, we require the init to be less than the - // exit value and an equality or less than comparison. - if (InitValue >= ExitValue || - NewPred == CmpInst::ICMP_SGT || NewPred == CmpInst::ICMP_SGE) + // exit value. + if (InitValue >= ExitValue) return; uint32_t Range = uint32_t(ExitValue-InitValue); - if (NewPred == CmpInst::ICMP_SLE) { - // Normalize SLE -> SLT, check for infinite loop. + // Check for infinite loop, either: + // while (i <= Exit) or until (i > Exit) + if (NewPred == CmpInst::ICMP_SLE || NewPred == CmpInst::ICMP_SGT) { if (++Range == 0) return; // Range overflows. } @@ -399,14 +376,14 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { } else { // If we have a negative stride, we require the init to be greater than the - // exit value and an equality or greater than comparison. - if (InitValue >= ExitValue || - NewPred == CmpInst::ICMP_SLT || NewPred == CmpInst::ICMP_SLE) + // exit value. + if (InitValue <= ExitValue) return; uint32_t Range = uint32_t(InitValue-ExitValue); - if (NewPred == CmpInst::ICMP_SGE) { - // Normalize SGE -> SGT, check for infinite loop. + // Check for infinite loop, either: + // while (i >= Exit) or until (i < Exit) + if (NewPred == CmpInst::ICMP_SGE || NewPred == CmpInst::ICMP_SLT) { if (++Range == 0) return; // Range overflows. } @@ -464,7 +441,7 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // platforms. 
if (WeakPH) { Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv", - PN->getParent()->getFirstNonPHI()); + PN->getParent()->getFirstInsertionPt()); PN->replaceAllUsesWith(Conv); RecursivelyDeleteTriviallyDeadInstructions(PN); } @@ -472,6 +449,8 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // Add a new IVUsers entry for the newly-created integer PHI. if (IU) IU->AddUsersIfInteresting(NewPHI); + + Changed = true; } void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) { @@ -617,45 +596,15 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { //===----------------------------------------------------------------------===// // Rewrite IV users based on a canonical IV. -// To be replaced by -disable-iv-rewrite. +// Only for use with -enable-iv-rewrite. //===----------------------------------------------------------------------===// -/// SimplifyIVUsers - Iteratively perform simplification on IVUsers within this -/// loop. IVUsers is treated as a worklist. Each successive simplification may -/// push more users which may themselves be candidates for simplification. -/// -/// This is the old approach to IV simplification to be replaced by -/// SimplifyIVUsersNoRewrite. -/// -void IndVarSimplify::SimplifyIVUsers(SCEVExpander &Rewriter) { - // Each round of simplification involves a round of eliminating operations - // followed by a round of widening IVs. A single IVUsers worklist is used - // across all rounds. The inner loop advances the user. If widening exposes - // more uses, then another pass through the outer loop is triggered. - for (IVUsers::iterator I = IU->begin(); I != IU->end(); ++I) { - Instruction *UseInst = I->getUser(); - Value *IVOperand = I->getOperandValToReplace(); - - if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) { - EliminateIVComparison(ICmp, IVOperand); - continue; - } - if (BinaryOperator *Rem = dyn_cast<BinaryOperator>(UseInst)) { - bool IsSigned = Rem->getOpcode() == Instruction::SRem; - if (IsSigned || Rem->getOpcode() == Instruction::URem) { - EliminateIVRemainder(Rem, IVOperand, IsSigned); - continue; - } - } - } -} - -// FIXME: It is an extremely bad idea to indvar substitute anything more -// complex than affine induction variables. Doing so will put expensive -// polynomial evaluations inside of the loop, and the str reduction pass -// currently can only reduce affine polynomials. For now just disable -// indvar subst on anything more complex than an affine addrec, unless -// it can be expanded to a trivial value. +/// FIXME: It is an extremely bad idea to indvar substitute anything more +/// complex than affine induction variables. Doing so will put expensive +/// polynomial evaluations inside of the loop, and the str reduction pass +/// currently can only reduce affine polynomials. For now just disable +/// indvar subst on anything more complex than an affine addrec, unless +/// it can be expanded to a trivial value. static bool isSafe(const SCEV *S, const Loop *L, ScalarEvolution *SE) { // Loop-invariant values are safe. if (SE->isLoopInvariant(S, L)) return true; @@ -666,7 +615,8 @@ static bool isSafe(const SCEV *S, const Loop *L, ScalarEvolution *SE) { return AR->isAffine(); // An add is safe it all its operands are safe. 
- if (const SCEVCommutativeExpr *Commutative = dyn_cast<SCEVCommutativeExpr>(S)) { + if (const SCEVCommutativeExpr *Commutative + = dyn_cast<SCEVCommutativeExpr>(S)) { for (SCEVCommutativeExpr::op_iterator I = Commutative->op_begin(), E = Commutative->op_end(); I != E; ++I) if (!isSafe(*I, L, SE)) return false; @@ -771,18 +721,37 @@ namespace { // extend operations. This information is recorded by CollectExtend and // provides the input to WidenIV. struct WideIVInfo { + PHINode *NarrowIV; Type *WidestNativeType; // Widest integer type created [sz]ext - bool IsSigned; // Was an sext user seen before a zext? + bool IsSigned; // Was an sext user seen before a zext? + + WideIVInfo() : NarrowIV(0), WidestNativeType(0), IsSigned(false) {} + }; + + class WideIVVisitor : public IVVisitor { + ScalarEvolution *SE; + const TargetData *TD; - WideIVInfo() : WidestNativeType(0), IsSigned(false) {} + public: + WideIVInfo WI; + + WideIVVisitor(PHINode *NarrowIV, ScalarEvolution *SCEV, + const TargetData *TData) : + SE(SCEV), TD(TData) { WI.NarrowIV = NarrowIV; } + + // Implement the interface used by simplifyUsersOfIV. + virtual void visitCast(CastInst *Cast); }; } -/// CollectExtend - Update information about the induction variable that is +/// visitCast - Update information about the induction variable that is /// extended by this sign or zero extend operation. This is used to determine /// the final width of the IV before actually widening it. -static void CollectExtend(CastInst *Cast, bool IsSigned, WideIVInfo &WI, - ScalarEvolution *SE, const TargetData *TD) { +void WideIVVisitor::visitCast(CastInst *Cast) { + bool IsSigned = Cast->getOpcode() == Instruction::SExt; + if (!IsSigned && Cast->getOpcode() != Instruction::ZExt) + return; + Type *Ty = Cast->getType(); uint64_t Width = SE->getTypeSizeInBits(Ty); if (TD && !TD->isLegalInteger(Width)) @@ -845,10 +814,10 @@ class WidenIV { SmallVector<NarrowIVDefUse, 8> NarrowIVUsers; public: - WidenIV(PHINode *PN, const WideIVInfo &WI, LoopInfo *LInfo, + WidenIV(const WideIVInfo &WI, LoopInfo *LInfo, ScalarEvolution *SEv, DominatorTree *DTree, SmallVectorImpl<WeakVH> &DI) : - OrigPhi(PN), + OrigPhi(WI.NarrowIV), WideType(WI.WidestNativeType), IsSigned(WI.IsSigned), LI(LInfo), @@ -865,18 +834,42 @@ public: PHINode *CreateWideIV(SCEVExpander &Rewriter); protected: + Value *getExtend(Value *NarrowOper, Type *WideType, bool IsSigned, + Instruction *Use); + Instruction *CloneIVUser(NarrowIVDefUse DU); const SCEVAddRecExpr *GetWideRecurrence(Instruction *NarrowUse); + const SCEVAddRecExpr* GetExtendedOperandRecurrence(NarrowIVDefUse DU); + Instruction *WidenIVUse(NarrowIVDefUse DU); void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef); }; } // anonymous namespace -static Value *getExtend( Value *NarrowOper, Type *WideType, - bool IsSigned, IRBuilder<> &Builder) { +/// isLoopInvariant - Perform a quick domtree based check for loop invariance +/// assuming that V is used within the loop. LoopInfo::isLoopInvariant() seems +/// gratuitous for this purpose. +static bool isLoopInvariant(Value *V, const Loop *L, const DominatorTree *DT) { + Instruction *Inst = dyn_cast<Instruction>(V); + if (!Inst) + return true; + + return DT->properlyDominates(Inst->getParent(), L->getHeader()); +} + +Value *WidenIV::getExtend(Value *NarrowOper, Type *WideType, bool IsSigned, + Instruction *Use) { + // Set the debug location and conservative insertion point. + IRBuilder<> Builder(Use); + // Hoist the insertion point into loop preheaders as far as possible. 
+ for (const Loop *L = LI->getLoopFor(Use->getParent()); + L && L->getLoopPreheader() && isLoopInvariant(NarrowOper, L, DT); + L = L->getParentLoop()) + Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator()); + return IsSigned ? Builder.CreateSExt(NarrowOper, WideType) : Builder.CreateZExt(NarrowOper, WideType); } @@ -901,22 +894,21 @@ Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) { case Instruction::AShr: DEBUG(dbgs() << "Cloning IVUser: " << *DU.NarrowUse << "\n"); - IRBuilder<> Builder(DU.NarrowUse); - // Replace NarrowDef operands with WideDef. Otherwise, we don't know // anything about the narrow operand yet so must insert a [sz]ext. It is // probably loop invariant and will be folded or hoisted. If it actually // comes from a widened IV, it should be removed during a future call to // WidenIVUse. Value *LHS = (DU.NarrowUse->getOperand(0) == DU.NarrowDef) ? DU.WideDef : - getExtend(DU.NarrowUse->getOperand(0), WideType, IsSigned, Builder); + getExtend(DU.NarrowUse->getOperand(0), WideType, IsSigned, DU.NarrowUse); Value *RHS = (DU.NarrowUse->getOperand(1) == DU.NarrowDef) ? DU.WideDef : - getExtend(DU.NarrowUse->getOperand(1), WideType, IsSigned, Builder); + getExtend(DU.NarrowUse->getOperand(1), WideType, IsSigned, DU.NarrowUse); BinaryOperator *NarrowBO = cast<BinaryOperator>(DU.NarrowUse); BinaryOperator *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS, NarrowBO->getName()); + IRBuilder<> Builder(DU.NarrowUse); Builder.Insert(WideBO); if (const OverflowingBinaryOperator *OBO = dyn_cast<OverflowingBinaryOperator>(NarrowBO)) { @@ -928,45 +920,46 @@ Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) { llvm_unreachable(0); } -/// HoistStep - Attempt to hoist an IV increment above a potential use. -/// -/// To successfully hoist, two criteria must be met: -/// - IncV operands dominate InsertPos and -/// - InsertPos dominates IncV -/// -/// Meeting the second condition means that we don't need to check all of IncV's -/// existing uses (it's moving up in the domtree). -/// -/// This does not yet recursively hoist the operands, although that would -/// not be difficult. -static bool HoistStep(Instruction *IncV, Instruction *InsertPos, - const DominatorTree *DT) -{ - if (DT->dominates(IncV, InsertPos)) - return true; +/// No-wrap operations can transfer sign extension of their result to their +/// operands. Generate the SCEV value for the widened operation without +/// actually modifying the IR yet. If the expression after extending the +/// operands is an AddRec for this loop, return it. +const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { + // Handle the common case of add<nsw/nuw> + if (DU.NarrowUse->getOpcode() != Instruction::Add) + return 0; - if (!DT->dominates(InsertPos->getParent(), IncV->getParent())) - return false; + // One operand (NarrowDef) has already been extended to WideDef. Now determine + // if extending the other will lead to a recurrence. + unsigned ExtendOperIdx = DU.NarrowUse->getOperand(0) == DU.NarrowDef ? 
1 : 0; + assert(DU.NarrowUse->getOperand(1-ExtendOperIdx) == DU.NarrowDef && "bad DU"); + + const SCEV *ExtendOperExpr = 0; + const OverflowingBinaryOperator *OBO = + cast<OverflowingBinaryOperator>(DU.NarrowUse); + if (IsSigned && OBO->hasNoSignedWrap()) + ExtendOperExpr = SE->getSignExtendExpr( + SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType); + else if(!IsSigned && OBO->hasNoUnsignedWrap()) + ExtendOperExpr = SE->getZeroExtendExpr( + SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType); + else + return 0; - if (IncV->mayHaveSideEffects()) - return false; + const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>( + SE->getAddExpr(SE->getSCEV(DU.WideDef), ExtendOperExpr, + IsSigned ? SCEV::FlagNSW : SCEV::FlagNUW)); - // Attempt to hoist IncV - for (User::op_iterator OI = IncV->op_begin(), OE = IncV->op_end(); - OI != OE; ++OI) { - Instruction *OInst = dyn_cast<Instruction>(OI); - if (OInst && !DT->dominates(OInst, InsertPos)) - return false; - } - IncV->moveBefore(InsertPos); - return true; + if (!AddRec || AddRec->getLoop() != L) + return 0; + return AddRec; } -// GetWideRecurrence - Is this instruction potentially interesting from IVUsers' -// perspective after widening it's type? In other words, can the extend be -// safely hoisted out of the loop with SCEV reducing the value to a recurrence -// on the same loop. If so, return the sign or zero extended -// recurrence. Otherwise return NULL. +/// GetWideRecurrence - Is this instruction potentially interesting from +/// IVUsers' perspective after widening it's type? In other words, can the +/// extend be safely hoisted out of the loop with SCEV reducing the value to a +/// recurrence on the same loop. If so, return the sign or zero extended +/// recurrence. Otherwise return NULL. const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { if (!SE->isSCEVable(NarrowUse->getType())) return 0; @@ -985,7 +978,6 @@ const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(WideExpr); if (!AddRec || AddRec->getLoop() != L) return 0; - return AddRec; } @@ -1039,6 +1031,9 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU) { // Does this user itself evaluate to a recurrence after widening? const SCEVAddRecExpr *WideAddRec = GetWideRecurrence(DU.NarrowUse); if (!WideAddRec) { + WideAddRec = GetExtendedOperandRecurrence(DU); + } + if (!WideAddRec) { // This user does not evaluate to a recurence after widening, so don't // follow it. Instead insert a Trunc to kill off the original use, // eventually isolating the original narrow IV so it can be removed. @@ -1055,9 +1050,9 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU) { // Reuse the IV increment that SCEVExpander created as long as it dominates // NarrowUse. Instruction *WideUse = 0; - if (WideAddRec == WideIncExpr && HoistStep(WideInc, DU.NarrowUse, DT)) { + if (WideAddRec == WideIncExpr + && SCEVExpander::hoistStep(WideInc, DU.NarrowUse, DT)) WideUse = WideInc; - } else { WideUse = CloneIVUser(DU); if (!WideUse) @@ -1178,183 +1173,17 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { // Simplification of IV users based on SCEV evaluation. 
//===----------------------------------------------------------------------===// -void IndVarSimplify::EliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) { - unsigned IVOperIdx = 0; - ICmpInst::Predicate Pred = ICmp->getPredicate(); - if (IVOperand != ICmp->getOperand(0)) { - // Swapped - assert(IVOperand == ICmp->getOperand(1) && "Can't find IVOperand"); - IVOperIdx = 1; - Pred = ICmpInst::getSwappedPredicate(Pred); - } - - // Get the SCEVs for the ICmp operands. - const SCEV *S = SE->getSCEV(ICmp->getOperand(IVOperIdx)); - const SCEV *X = SE->getSCEV(ICmp->getOperand(1 - IVOperIdx)); - - // Simplify unnecessary loops away. - const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent()); - S = SE->getSCEVAtScope(S, ICmpLoop); - X = SE->getSCEVAtScope(X, ICmpLoop); - - // If the condition is always true or always false, replace it with - // a constant value. - if (SE->isKnownPredicate(Pred, S, X)) - ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext())); - else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) - ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext())); - else - return; - DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); - ++NumElimCmp; - Changed = true; - DeadInsts.push_back(ICmp); -} - -void IndVarSimplify::EliminateIVRemainder(BinaryOperator *Rem, - Value *IVOperand, - bool IsSigned) { - // We're only interested in the case where we know something about - // the numerator. - if (IVOperand != Rem->getOperand(0)) - return; - - // Get the SCEVs for the ICmp operands. - const SCEV *S = SE->getSCEV(Rem->getOperand(0)); - const SCEV *X = SE->getSCEV(Rem->getOperand(1)); - - // Simplify unnecessary loops away. - const Loop *ICmpLoop = LI->getLoopFor(Rem->getParent()); - S = SE->getSCEVAtScope(S, ICmpLoop); - X = SE->getSCEVAtScope(X, ICmpLoop); - - // i % n --> i if i is in [0,n). - if ((!IsSigned || SE->isKnownNonNegative(S)) && - SE->isKnownPredicate(IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, - S, X)) - Rem->replaceAllUsesWith(Rem->getOperand(0)); - else { - // (i+1) % n --> (i+1)==n?0:(i+1) if i is in [0,n). - const SCEV *LessOne = - SE->getMinusSCEV(S, SE->getConstant(S->getType(), 1)); - if (IsSigned && !SE->isKnownNonNegative(LessOne)) - return; - - if (!SE->isKnownPredicate(IsSigned ? - ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, - LessOne, X)) - return; - - ICmpInst *ICmp = new ICmpInst(Rem, ICmpInst::ICMP_EQ, - Rem->getOperand(0), Rem->getOperand(1), - "tmp"); - SelectInst *Sel = - SelectInst::Create(ICmp, - ConstantInt::get(Rem->getType(), 0), - Rem->getOperand(0), "tmp", Rem); - Rem->replaceAllUsesWith(Sel); - } - - // Inform IVUsers about the new users. - if (IU) { - if (Instruction *I = dyn_cast<Instruction>(Rem->getOperand(0))) - IU->AddUsersIfInteresting(I); - } - DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n'); - ++NumElimRem; - Changed = true; - DeadInsts.push_back(Rem); -} - -/// EliminateIVUser - Eliminate an operation that consumes a simple IV and has -/// no observable side-effect given the range of IV values. 
-bool IndVarSimplify::EliminateIVUser(Instruction *UseInst, - Instruction *IVOperand) { - if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) { - EliminateIVComparison(ICmp, IVOperand); - return true; - } - if (BinaryOperator *Rem = dyn_cast<BinaryOperator>(UseInst)) { - bool IsSigned = Rem->getOpcode() == Instruction::SRem; - if (IsSigned || Rem->getOpcode() == Instruction::URem) { - EliminateIVRemainder(Rem, IVOperand, IsSigned); - return true; - } - } - - // Eliminate any operation that SCEV can prove is an identity function. - if (!SE->isSCEVable(UseInst->getType()) || - (UseInst->getType() != IVOperand->getType()) || - (SE->getSCEV(UseInst) != SE->getSCEV(IVOperand))) - return false; - - DEBUG(dbgs() << "INDVARS: Eliminated identity: " << *UseInst << '\n'); - - UseInst->replaceAllUsesWith(IVOperand); - ++NumElimIdentity; - Changed = true; - DeadInsts.push_back(UseInst); - return true; -} - -/// pushIVUsers - Add all uses of Def to the current IV's worklist. -/// -static void pushIVUsers( - Instruction *Def, - SmallPtrSet<Instruction*,16> &Simplified, - SmallVectorImpl< std::pair<Instruction*,Instruction*> > &SimpleIVUsers) { - - for (Value::use_iterator UI = Def->use_begin(), E = Def->use_end(); - UI != E; ++UI) { - Instruction *User = cast<Instruction>(*UI); - - // Avoid infinite or exponential worklist processing. - // Also ensure unique worklist users. - // If Def is a LoopPhi, it may not be in the Simplified set, so check for - // self edges first. - if (User != Def && Simplified.insert(User)) - SimpleIVUsers.push_back(std::make_pair(User, Def)); - } -} - -/// isSimpleIVUser - Return true if this instruction generates a simple SCEV -/// expression in terms of that IV. -/// -/// This is similar to IVUsers' isInsteresting() but processes each instruction -/// non-recursively when the operand is already known to be a simpleIVUser. -/// -static bool isSimpleIVUser(Instruction *I, const Loop *L, ScalarEvolution *SE) { - if (!SE->isSCEVable(I->getType())) - return false; - - // Get the symbolic expression for this instruction. - const SCEV *S = SE->getSCEV(I); - - // Only consider affine recurrences. - const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S); - if (AR && AR->getLoop() == L) - return true; - - return false; -} - -/// SimplifyIVUsersNoRewrite - Iteratively perform simplification on a worklist -/// of IV users. Each successive simplification may push more users which may +/// SimplifyAndExtend - Iteratively perform simplification on a worklist of IV +/// users. Each successive simplification may push more users which may /// themselves be candidates for simplification. /// -/// The "NoRewrite" algorithm does not require IVUsers analysis. Instead, it -/// simplifies instructions in-place during analysis. Rather than rewriting -/// induction variables bottom-up from their users, it transforms a chain of -/// IVUsers top-down, updating the IR only when it encouters a clear -/// optimization opportunitiy. A SCEVExpander "Rewriter" instance is still -/// needed, but only used to generate a new IV (phi) of wider type for sign/zero -/// extend elimination. +/// Sign/Zero extend elimination is interleaved with IV simplification. /// -/// Once DisableIVRewrite is default, LSR will be the only client of IVUsers. 
-/// -void IndVarSimplify::SimplifyIVUsersNoRewrite(Loop *L, SCEVExpander &Rewriter) { - std::map<PHINode *, WideIVInfo> WideIVMap; +void IndVarSimplify::SimplifyAndExtend(Loop *L, + SCEVExpander &Rewriter, + LPPassManager &LPM) { + SmallVector<WideIVInfo, 8> WideIVs; SmallVector<PHINode*, 8> LoopPhis; for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { @@ -1370,108 +1199,27 @@ void IndVarSimplify::SimplifyIVUsersNoRewrite(Loop *L, SCEVExpander &Rewriter) { // extension. The first time SCEV attempts to normalize sign/zero extension, // the result becomes final. So for the most predictable results, we delay // evaluation of sign/zero extend evaluation until needed, and avoid running - // other SCEV based analysis prior to SimplifyIVUsersNoRewrite. + // other SCEV based analysis prior to SimplifyAndExtend. do { PHINode *CurrIV = LoopPhis.pop_back_val(); // Information about sign/zero extensions of CurrIV. - WideIVInfo WI; - - // Instructions processed by SimplifyIVUsers for CurrIV. - SmallPtrSet<Instruction*,16> Simplified; + WideIVVisitor WIV(CurrIV, SE, TD); - // Use-def pairs if IV users waiting to be processed for CurrIV. - SmallVector<std::pair<Instruction*, Instruction*>, 8> SimpleIVUsers; + Changed |= simplifyUsersOfIV(CurrIV, SE, &LPM, DeadInsts, &WIV); - // Push users of the current LoopPhi. In rare cases, pushIVUsers may be - // called multiple times for the same LoopPhi. This is the proper thing to - // do for loop header phis that use each other. - pushIVUsers(CurrIV, Simplified, SimpleIVUsers); - - while (!SimpleIVUsers.empty()) { - Instruction *UseInst, *Operand; - tie(UseInst, Operand) = SimpleIVUsers.pop_back_val(); - // Bypass back edges to avoid extra work. - if (UseInst == CurrIV) continue; - - if (EliminateIVUser(UseInst, Operand)) { - pushIVUsers(Operand, Simplified, SimpleIVUsers); - continue; - } - if (CastInst *Cast = dyn_cast<CastInst>(UseInst)) { - bool IsSigned = Cast->getOpcode() == Instruction::SExt; - if (IsSigned || Cast->getOpcode() == Instruction::ZExt) { - CollectExtend(Cast, IsSigned, WI, SE, TD); - } - continue; - } - if (isSimpleIVUser(UseInst, L, SE)) { - pushIVUsers(UseInst, Simplified, SimpleIVUsers); - } - } - if (WI.WidestNativeType) { - WideIVMap[CurrIV] = WI; + if (WIV.WI.WidestNativeType) { + WideIVs.push_back(WIV.WI); } } while(!LoopPhis.empty()); - for (std::map<PHINode *, WideIVInfo>::const_iterator I = WideIVMap.begin(), - E = WideIVMap.end(); I != E; ++I) { - WidenIV Widener(I->first, I->second, LI, SE, DT, DeadInsts); + for (; !WideIVs.empty(); WideIVs.pop_back()) { + WidenIV Widener(WideIVs.back(), LI, SE, DT, DeadInsts); if (PHINode *WidePhi = Widener.CreateWideIV(Rewriter)) { Changed = true; LoopPhis.push_back(WidePhi); } } - WideIVMap.clear(); - } -} - -/// SimplifyCongruentIVs - Check for congruent phis in this loop header and -/// populate ExprToIVMap for use later. -/// -void IndVarSimplify::SimplifyCongruentIVs(Loop *L) { - DenseMap<const SCEV *, PHINode *> ExprToIVMap; - for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { - PHINode *Phi = cast<PHINode>(I); - if (!SE->isSCEVable(Phi->getType())) - continue; - - const SCEV *S = SE->getSCEV(Phi); - DenseMap<const SCEV *, PHINode *>::const_iterator Pos; - bool Inserted; - tie(Pos, Inserted) = ExprToIVMap.insert(std::make_pair(S, Phi)); - if (Inserted) - continue; - PHINode *OrigPhi = Pos->second; - - // If one phi derives from the other via GEPs, types may differ. 
- if (OrigPhi->getType() != Phi->getType()) - continue; - - // Replacing the congruent phi is sufficient because acyclic redundancy - // elimination, CSE/GVN, should handle the rest. However, once SCEV proves - // that a phi is congruent, it's almost certain to be the head of an IV - // user cycle that is isomorphic with the original phi. So it's worth - // eagerly cleaning up the common case of a single IV increment. - if (BasicBlock *LatchBlock = L->getLoopLatch()) { - Instruction *OrigInc = - cast<Instruction>(OrigPhi->getIncomingValueForBlock(LatchBlock)); - Instruction *IsomorphicInc = - cast<Instruction>(Phi->getIncomingValueForBlock(LatchBlock)); - if (OrigInc != IsomorphicInc && - OrigInc->getType() == IsomorphicInc->getType() && - SE->getSCEV(OrigInc) == SE->getSCEV(IsomorphicInc) && - HoistStep(OrigInc, IsomorphicInc, DT)) { - DEBUG(dbgs() << "INDVARS: Eliminated congruent iv.inc: " - << *IsomorphicInc << '\n'); - IsomorphicInc->replaceAllUsesWith(OrigInc); - DeadInsts.push_back(IsomorphicInc); - } - } - DEBUG(dbgs() << "INDVARS: Eliminated congruent iv: " << *Phi << '\n'); - ++NumElimIV; - Phi->replaceAllUsesWith(OrigPhi); - DeadInsts.push_back(Phi); } } @@ -1479,9 +1227,9 @@ void IndVarSimplify::SimplifyCongruentIVs(Loop *L) { // LinearFunctionTestReplace and its kin. Rewrite the loop exit condition. //===----------------------------------------------------------------------===// -// Check for expressions that ScalarEvolution generates to compute -// BackedgeTakenInfo. If these expressions have not been reduced, then expanding -// them may incur additional cost (albeit in the loop preheader). +/// Check for expressions that ScalarEvolution generates to compute +/// BackedgeTakenInfo. If these expressions have not been reduced, then +/// expanding them may incur additional cost (albeit in the loop preheader). static bool isHighCostExpansion(const SCEV *S, BranchInst *BI, ScalarEvolution *SE) { // If the backedge-taken count is a UDiv, it's very likely a UDiv that @@ -1502,7 +1250,7 @@ static bool isHighCostExpansion(const SCEV *S, BranchInst *BI, } } - if (!DisableIVRewrite || ForceLFTR) + if (EnableIVRewrite) return false; // Recurse past add expressions, which commonly occur in the @@ -1580,17 +1328,6 @@ static Type *getBackedgeIVType(Loop *L) { return Ty; } -/// isLoopInvariant - Perform a quick domtree based check for loop invariance -/// assuming that V is used within the loop. LoopInfo::isLoopInvariant() seems -/// gratuitous for this purpose. -static bool isLoopInvariant(Value *V, Loop *L, DominatorTree *DT) { - Instruction *Inst = dyn_cast<Instruction>(V); - if (!Inst) - return true; - - return DT->properlyDominates(Inst->getParent(), L->getHeader()); -} - /// getLoopPhiForCounter - Return the loop header phi IFF IncV adds a loop /// invariant value to the phi. static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) { @@ -1779,10 +1516,9 @@ LinearFunctionTestReplace(Loop *L, assert(canExpandBackedgeTakenCount(L, SE) && "precondition"); BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator()); - // In DisableIVRewrite mode, IndVar is not necessarily a canonical IV. In this - // mode, LFTR can ignore IV overflow and truncate to the width of + // LFTR can ignore IV overflow and truncate to the width of // BECount. This avoids materializing the add(zext(add)) expression. - Type *CntTy = DisableIVRewrite ? + Type *CntTy = !EnableIVRewrite ? 
BackedgeTakenCount->getType() : IndVar->getType(); const SCEV *IVLimit = BackedgeTakenCount; @@ -1832,7 +1568,7 @@ LinearFunctionTestReplace(Loop *L, const SCEV *IVInit = AR->getStart(); // For pointer types, sign extend BECount in order to materialize a GEP. - // Note that for DisableIVRewrite, we never run SCEVExpander on a + // Note that for without EnableIVRewrite, we never run SCEVExpander on a // pointer type, because we must preserve the existing GEPs. Instead we // directly generate a GEP later. if (IVInit->getType()->isPointerTy()) { @@ -1919,7 +1655,7 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) { BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) return; - Instruction *InsertPt = ExitBlock->getFirstNonPHI(); + Instruction *InsertPt = ExitBlock->getFirstInsertionPt(); BasicBlock::iterator I = Preheader->getTerminator(); while (I != Preheader->begin()) { --I; @@ -1940,6 +1676,10 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) { if (isa<DbgInfoIntrinsic>(I)) continue; + // Skip landingpad instructions. + if (isa<LandingPadInst>(I)) + continue; + // Don't sink static AllocaInsts out of the entry block, which would // turn them into dynamic allocas! if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) @@ -2006,7 +1746,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { if (!L->isLoopSimplifyForm()) return false; - if (!DisableIVRewrite) + if (EnableIVRewrite) IU = &getAnalysis<IVUsers>(); LI = &getAnalysis<LoopInfo>(); SE = &getAnalysis<ScalarEvolution>(); @@ -2024,6 +1764,9 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Create a rewriter object which we'll use to transform the code with. SCEVExpander Rewriter(*SE, "indvars"); +#ifndef NDEBUG + Rewriter.setDebugType(DEBUG_TYPE); +#endif // Eliminate redundant IV users. // @@ -2031,9 +1774,9 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // attempt to avoid evaluating SCEVs for sign/zero extend operations until // other expressions involving loop IVs have been evaluated. This helps SCEV // set no-wrap flags before normalizing sign/zero extension. - if (DisableIVRewrite) { + if (!EnableIVRewrite) { Rewriter.disableCanonicalMode(); - SimplifyIVUsersNoRewrite(L, Rewriter); + SimplifyAndExtend(L, Rewriter, LPM); } // Check to see if this loop has a computable loop-invariant execution count. @@ -2046,26 +1789,25 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { RewriteLoopExitValues(L, Rewriter); // Eliminate redundant IV users. - if (!DisableIVRewrite) - SimplifyIVUsers(Rewriter); + if (EnableIVRewrite) + Changed |= simplifyIVUsers(IU, SE, &LPM, DeadInsts); // Eliminate redundant IV cycles. - if (DisableIVRewrite) - SimplifyCongruentIVs(L); + if (!EnableIVRewrite) + NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts); // Compute the type of the largest recurrence expression, and decide whether // a canonical induction variable should be inserted. Type *LargestType = 0; bool NeedCannIV = false; - bool ReuseIVForExit = DisableIVRewrite && !ForceLFTR; bool ExpandBECount = canExpandBackedgeTakenCount(L, SE); - if (ExpandBECount && !ReuseIVForExit) { + if (EnableIVRewrite && ExpandBECount) { // If we have a known trip count and a single exit block, we'll be // rewriting the loop exit test condition below, which requires a // canonical induction variable. 
NeedCannIV = true; Type *Ty = BackedgeTakenCount->getType(); - if (DisableIVRewrite) { + if (!EnableIVRewrite) { // In this mode, SimplifyIVUsers may have already widened the IV used by // the backedge test and inserted a Trunc on the compare's operand. Get // the wider type to avoid creating a redundant narrow IV only used by the @@ -2077,7 +1819,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { SE->getTypeSizeInBits(LargestType)) LargestType = SE->getEffectiveSCEVType(Ty); } - if (!DisableIVRewrite) { + if (EnableIVRewrite) { for (IVUsers::const_iterator I = IU->begin(), E = IU->end(); I != E; ++I) { NeedCannIV = true; Type *Ty = @@ -2119,10 +1861,10 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // the end of the pass. while (!OldCannIVs.empty()) { PHINode *OldCannIV = OldCannIVs.pop_back_val(); - OldCannIV->insertBefore(L->getHeader()->getFirstNonPHI()); + OldCannIV->insertBefore(L->getHeader()->getFirstInsertionPt()); } } - else if (ExpandBECount && ReuseIVForExit && needsLFTR(L, DT)) { + else if (!EnableIVRewrite && ExpandBECount && needsLFTR(L, DT)) { IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT, TD); } // If we have a trip count expression, rewrite the loop's exit condition @@ -2143,7 +1885,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar, Rewriter); } // Rewrite IV-derived expressions. - if (!DisableIVRewrite) + if (EnableIVRewrite) RewriteIVExpressions(L, Rewriter); // Clear the rewriter cache, because values that are in the rewriter's cache @@ -2180,7 +1922,8 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Verify that LFTR, and any other change have not interfered with SCEV's // ability to compute trip count. #ifndef NDEBUG - if (DisableIVRewrite && !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { + if (!EnableIVRewrite && VerifyIndvars && + !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { SE->forgetLoop(L); const SCEV *NewBECount = SE->getBackedgeTakenCount(L); if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) < diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index b500d5b..f410af3 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -811,8 +811,8 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { /// important optimization that encourages jump threading, and needs to be run /// interlaced with other jump threading tasks. bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { - // Don't hack volatile loads. - if (LI->isVolatile()) return false; + // Don't hack volatile/atomic loads. + if (!LI->isSimple()) return false; // If the load is defined in a block with exactly one predecessor, it can't be // partially redundant. diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 66add6c..b79bb13 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -151,6 +151,11 @@ namespace { /// bool isSafeToExecuteUnconditionally(Instruction &I); + /// isGuaranteedToExecute - Check that the instruction is guaranteed to + /// execute. + /// + bool isGuaranteedToExecute(Instruction &I); + /// pointerInvalidatedByLoop - Return true if the body of this loop may /// store into the memory location pointed to by V. 
/// @@ -357,8 +362,8 @@ void LICM::HoistRegion(DomTreeNode *N) { bool LICM::canSinkOrHoistInst(Instruction &I) { // Loads have extra constraints we have to verify before we can hoist them. if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { - if (LI->isVolatile()) - return false; // Don't hoist volatile loads! + if (!LI->isUnordered()) + return false; // Don't hoist volatile/atomic loads! // Loads from constant memory are always safe to move, even if they end up // in the same alias set as something that ends up being modified. @@ -461,7 +466,7 @@ void LICM::sink(Instruction &I) { } else { // Move the instruction to the start of the exit block, after any PHI // nodes in it. - I.moveBefore(ExitBlocks[0]->getFirstNonPHI()); + I.moveBefore(ExitBlocks[0]->getFirstInsertionPt()); // This instruction is no longer in the AST for the current loop, because // we just sunk it out of the loop. If we just sunk it into an outer @@ -504,7 +509,7 @@ void LICM::sink(Instruction &I) { continue; // Insert the code after the last PHI node. - BasicBlock::iterator InsertPt = ExitBlock->getFirstNonPHI(); + BasicBlock::iterator InsertPt = ExitBlock->getFirstInsertionPt(); // If this is the first exit block processed, just move the original // instruction, otherwise clone the original instruction and insert @@ -577,6 +582,10 @@ bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) { if (Inst.isSafeToSpeculativelyExecute()) return true; + return isGuaranteedToExecute(Inst); +} + +bool LICM::isGuaranteedToExecute(Instruction &Inst) { // Otherwise we have to check to make sure that the instruction dominates all // of the exit blocks. If it doesn't, then there is a path out of the loop // which does not execute this instruction, so we can't hoist it. @@ -635,7 +644,7 @@ namespace { for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) { BasicBlock *ExitBlock = LoopExitBlocks[i]; Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock); - Instruction *InsertPos = ExitBlock->getFirstNonPHI(); + Instruction *InsertPos = ExitBlock->getFirstInsertionPt(); StoreInst *NewSI = new StoreInst(LiveInValue, SomePtr, InsertPos); NewSI->setAlignment(Alignment); NewSI->setDebugLoc(DL); @@ -713,34 +722,41 @@ void LICM::PromoteAliasSet(AliasSet &AS) { // If there is an non-load/store instruction in the loop, we can't promote // it. - unsigned InstAlignment; if (LoadInst *load = dyn_cast<LoadInst>(Use)) { - assert(!cast<LoadInst>(Use)->isVolatile() && "AST broken"); - InstAlignment = load->getAlignment(); + assert(!load->isVolatile() && "AST broken"); + if (!load->isSimple()) + return; } else if (StoreInst *store = dyn_cast<StoreInst>(Use)) { // Stores *of* the pointer are not interesting, only stores *to* the // pointer. if (Use->getOperand(1) != ASIV) continue; - InstAlignment = store->getAlignment(); - assert(!cast<StoreInst>(Use)->isVolatile() && "AST broken"); + assert(!store->isVolatile() && "AST broken"); + if (!store->isSimple()) + return; + + // Note that we only check GuaranteedToExecute inside the store case + // so that we do not introduce stores where they did not exist before + // (which would break the LLVM concurrency model). + + // If the alignment of this instruction allows us to specify a more + // restrictive (and performant) alignment and if we are sure this + // instruction will be executed, update the alignment. + // Larger is better, with the exception of 0 being the best alignment. 
+ unsigned InstAlignment = store->getAlignment(); + if ((InstAlignment > Alignment || InstAlignment == 0) + && (Alignment != 0)) + if (isGuaranteedToExecute(*Use)) { + GuaranteedToExecute = true; + Alignment = InstAlignment; + } + + if (!GuaranteedToExecute) + GuaranteedToExecute = isGuaranteedToExecute(*Use); + } else return; // Not a load or store. - // If the alignment of this instruction allows us to specify a more - // restrictive (and performant) alignment and if we are sure this - // instruction will be executed, update the alignment. - // Larger is better, with the exception of 0 being the best alignment. - if ((InstAlignment > Alignment || InstAlignment == 0) - && (Alignment != 0)) - if (isSafeToExecuteUnconditionally(*Use)) { - GuaranteedToExecute = true; - Alignment = InstAlignment; - } - - if (!GuaranteedToExecute) - GuaranteedToExecute = isSafeToExecuteUnconditionally(*Use); - LoopUses.push_back(Use); } } diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index ea4c515..ad15cbb 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -267,7 +267,7 @@ bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, /// processLoopStore - See if this store can be promoted to a memset or memcpy. bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { - if (SI->isVolatile()) return false; + if (!SI->isSimple()) return false; Value *StoredVal = SI->getValueOperand(); Value *StorePtr = SI->getPointerOperand(); @@ -314,7 +314,7 @@ bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getOperand(0))); if (LoadEv && LoadEv->getLoop() == CurLoop && LoadEv->isAffine() && - StoreEv->getOperand(1) == LoadEv->getOperand(1) && !LI->isVolatile()) + StoreEv->getOperand(1) == LoadEv->getOperand(1) && LI->isSimple()) if (processLoopStoreOfLoopLoad(SI, StoreSize, StoreEv, LoadEv, BECount)) return true; } @@ -463,7 +463,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, SplatValue = 0; } else { // Otherwise, this isn't an idiom we can transform. For example, we can't - // do anything with a 3-byte store, for example. + // do anything with a 3-byte store. return false; } diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index e90b5bc..3e122c2 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -70,12 +70,27 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/DenseSet.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLowering.h" #include <algorithm> using namespace llvm; +namespace llvm { +cl::opt<bool> EnableNested( + "enable-lsr-nested", cl::Hidden, cl::desc("Enable LSR on nested loops")); + +cl::opt<bool> EnableRetry( + "enable-lsr-retry", cl::Hidden, cl::desc("Enable LSR retry")); + +// Temporary flag to cleanup congruent phis after LSR phi expansion. +// It's currently disabled until we can determine whether it's truly useful or +// not. The flag should be removed after the v3.0 release. +cl::opt<bool> EnablePhiElim( + "enable-lsr-phielim", cl::Hidden, cl::desc("Enable LSR phi elimination")); +} + namespace { /// RegSortData - This class holds data which is used to order reuse candidates. 
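A standalone sketch, not part of the patch, of the alignment-merge rule used in the LICM::PromoteAliasSet hunk above: a larger explicit alignment is preferable, and an alignment of 0 (use the type's natural alignment) beats any explicit value; the real code additionally requires the store to be guaranteed to execute before committing the new alignment. The helper name below is made up for illustration.

#include <cassert>

// Mirrors the condition (InstAlignment > Alignment || InstAlignment == 0)
// && (Alignment != 0) from the promoted-store alignment update above.
static bool isMoreUsefulAlignment(unsigned InstAlign, unsigned CurAlign) {
  return (InstAlign > CurAlign || InstAlign == 0) && CurAlign != 0;
}

int main() {
  assert(isMoreUsefulAlignment(16, 4));  // larger explicit alignment wins
  assert(isMoreUsefulAlignment(0, 8));   // 0 (natural alignment) beats any explicit value
  assert(!isMoreUsefulAlignment(8, 0));  // nothing improves on an existing 0
  assert(!isMoreUsefulAlignment(4, 8));  // weaker alignment is ignored
  return 0;
}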
@@ -670,6 +685,21 @@ public: void Loose(); +#ifndef NDEBUG + // Once any of the metrics loses, they must all remain losers. + bool isValid() { + return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds + | ImmCost | SetupCost) != ~0u) + || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds + & ImmCost & SetupCost) == ~0u); + } +#endif + + bool isLoser() { + assert(isValid() && "invalid cost"); + return NumRegs == ~0u; + } + void RateFormula(const Formula &F, SmallPtrSet<const SCEV *, 16> &Regs, const DenseSet<const SCEV *> &VisitedRegs, @@ -702,34 +732,48 @@ void Cost::RateRegister(const SCEV *Reg, if (AR->getLoop() == L) AddRecCost += 1; /// TODO: This should be a function of the stride. - // If this is an addrec for a loop that's already been visited by LSR, - // don't second-guess its addrec phi nodes. LSR isn't currently smart - // enough to reason about more than one loop at a time. Consider these - // registers free and leave them alone. - else if (L->contains(AR->getLoop()) || + // If this is an addrec for another loop, don't second-guess its addrec phi + // nodes. LSR isn't currently smart enough to reason about more than one + // loop at a time. LSR has either already run on inner loops, will not run + // on other loops, and cannot be expected to change sibling loops. If the + // AddRec exists, consider it's register free and leave it alone. Otherwise, + // do not consider this formula at all. + // FIXME: why do we need to generate such fomulae? + else if (!EnableNested || L->contains(AR->getLoop()) || (!AR->getLoop()->contains(L) && DT.dominates(L->getHeader(), AR->getLoop()->getHeader()))) { for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin(); - PHINode *PN = dyn_cast<PHINode>(I); ++I) + PHINode *PN = dyn_cast<PHINode>(I); ++I) { if (SE.isSCEVable(PN->getType()) && (SE.getEffectiveSCEVType(PN->getType()) == SE.getEffectiveSCEVType(AR->getType())) && SE.getSCEV(PN) == AR) return; - + } + if (!EnableNested) { + Loose(); + return; + } // If this isn't one of the addrecs that the loop already has, it // would require a costly new phi and add. TODO: This isn't // precisely modeled right now. ++NumBaseAdds; - if (!Regs.count(AR->getStart())) + if (!Regs.count(AR->getStart())) { RateRegister(AR->getStart(), Regs, L, SE, DT); + if (isLoser()) + return; + } } // Add the step value register, if it needs one. // TODO: The non-affine case isn't precisely modeled here. - if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) - if (!Regs.count(AR->getStart())) + if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) { + if (!Regs.count(AR->getOperand(1))) { RateRegister(AR->getOperand(1), Regs, L, SE, DT); + if (isLoser()) + return; + } + } } ++NumRegs; @@ -769,6 +813,8 @@ void Cost::RateFormula(const Formula &F, return; } RatePrimaryRegister(ScaledReg, Regs, L, SE, DT); + if (isLoser()) + return; } for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) { @@ -778,6 +824,8 @@ void Cost::RateFormula(const Formula &F, return; } RatePrimaryRegister(BaseReg, Regs, L, SE, DT); + if (isLoser()) + return; } // Determine how many (unfolded) adds we'll need inside the loop. @@ -795,6 +843,7 @@ void Cost::RateFormula(const Formula &F, else if (Offset != 0) ImmCost += APInt(64, Offset, true).getMinSignedBits(); } + assert(isValid() && "invalid cost"); } /// Loose - Set this cost to a losing value. 
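A minimal sketch, separate from the patch, of the invariant that the new Cost::isValid() assertion above encodes: Loose() saturates every metric to ~0u, so a cost is either fully saturated or has no saturated metric at all (assuming individual metrics stay small enough that their bitwise OR cannot reach ~0u by accident). The struct and field names below are illustrative only.

#include <cassert>

struct CostSketch {
  unsigned NumRegs = 0, AddRecCost = 0, NumIVMuls = 0,
           NumBaseAdds = 0, ImmCost = 0, SetupCost = 0;

  // Mirror of Cost::Loose(): once a formula loses, every metric saturates.
  void Loose() {
    NumRegs = AddRecCost = NumIVMuls = NumBaseAdds = ImmCost = SetupCost = ~0u;
  }

  // Mirror of Cost::isValid(): either the OR of the metrics is not ~0u
  // (so none of them is saturated) or the AND is ~0u (so all of them are).
  bool isValid() const {
    return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds | ImmCost |
             SetupCost) != ~0u) ||
           ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds & ImmCost &
             SetupCost) == ~0u);
  }

  // Mirror of Cost::isLoser(): a saturated NumRegs marks a losing formula.
  bool isLoser() const {
    assert(isValid() && "invalid cost");
    return NumRegs == ~0u;
  }
};

int main() {
  CostSketch C;
  assert(C.isValid() && !C.isLoser());  // fresh cost: nothing saturated
  C.Loose();
  assert(C.isValid() && C.isLoser());   // loser: everything saturated
  C.SetupCost = 1;                      // a half-saturated cost is rejected
  assert(!C.isValid());
  return 0;
}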
@@ -1156,7 +1205,7 @@ static bool isLegalUse(const TargetLowering::AddrMode &AM, // If we have low-level target information, ask the target if it can fold an // integer immediate on an icmp. if (AM.BaseOffs != 0) { - if (TLI) return TLI->isLegalICmpImmediate(-AM.BaseOffs); + if (TLI) return TLI->isLegalICmpImmediate(-(uint64_t)AM.BaseOffs); return false; } @@ -1427,6 +1476,7 @@ void LSRInstance::OptimizeShadowIV() { ++UI; Instruction *ShadowUse = CandidateUI->getUser(); Type *DestTy = NULL; + bool IsSigned = false; /* If shadow use is a int->float cast then insert a second IV to eliminate this cast. @@ -1440,10 +1490,14 @@ void LSRInstance::OptimizeShadowIV() { for (unsigned i = 0; i < n; ++i, ++d) foo(d); */ - if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) + if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) { + IsSigned = false; DestTy = UCast->getDestTy(); - else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) + } + else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) { + IsSigned = true; DestTy = SCast->getDestTy(); + } if (!DestTy) continue; if (TLI) { @@ -1474,7 +1528,9 @@ void LSRInstance::OptimizeShadowIV() { ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry)); if (!Init) continue; - Constant *NewInit = ConstantFP::get(DestTy, Init->getZExtValue()); + Constant *NewInit = ConstantFP::get(DestTy, IsSigned ? + (double)Init->getSExtValue() : + (double)Init->getZExtValue()); BinaryOperator *Incr = dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch)); @@ -3275,6 +3331,9 @@ retry: skip:; } + if (!EnableRetry && !AnySatisfiedReqRegs) + return; + // If none of the formulae had all of the required registers, relax the // constraint so that we don't exclude all formulae. if (!AnySatisfiedReqRegs) { @@ -3298,6 +3357,10 @@ void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const { // SolveRecurse does all the work. SolveRecurse(Solution, SolutionCost, Workspace, CurCost, CurRegs, VisitedRegs); + if (Solution.empty()) { + DEBUG(dbgs() << "\nNo Satisfactory Solution\n"); + return; + } // Ok, we've now made all our decisions. DEBUG(dbgs() << "\n" @@ -3416,6 +3479,9 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator IP, // Don't insert instructions before PHI nodes. while (isa<PHINode>(IP)) ++IP; + // Ignore landingpad instructions. + while (isa<LandingPadInst>(IP)) ++IP; + // Ignore debug intrinsics. while (isa<DbgInfoIntrinsic>(IP)) ++IP; @@ -3527,7 +3593,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // The other interesting way of "folding" with an ICmpZero is to use a // negated immediate. if (!ICmpScaledV) - ICmpScaledV = ConstantInt::get(IntTy, -Offset); + ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset); else { Ops.push_back(SE.getUnknown(ICmpScaledV)); ICmpScaledV = ConstantInt::get(IntTy, Offset); @@ -3611,10 +3677,20 @@ void LSRInstance::RewriteForPHI(PHINode *PN, // users. if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 && !isa<IndirectBrInst>(BB->getTerminator())) { - Loop *PNLoop = LI.getLoopFor(PN->getParent()); - if (!PNLoop || PN->getParent() != PNLoop->getHeader()) { + BasicBlock *Parent = PN->getParent(); + Loop *PNLoop = LI.getLoopFor(Parent); + if (!PNLoop || Parent != PNLoop->getHeader()) { // Split the critical edge. 
- BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P); + BasicBlock *NewBB = 0; + if (!Parent->isLandingPad()) { + NewBB = SplitCriticalEdge(BB, Parent, P, + /*MergeIdenticalEdges=*/true, + /*DontDeleteUselessPhis=*/true); + } else { + SmallVector<BasicBlock*, 2> NewBBs; + SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs); + NewBB = NewBBs[0]; + } // If PN is outside of the loop and BB is in the loop, we want to // move the block to be immediately before the PHI block, not @@ -3700,6 +3776,7 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, SCEVExpander Rewriter(SE, "lsr"); Rewriter.disableCanonicalMode(); + Rewriter.enableLSRMode(); Rewriter.setIVIncInsertPos(L, IVIncInsertPos); // Expand the new value definitions and update the users. @@ -3740,6 +3817,23 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) OptimizeShadowIV(); OptimizeLoopTermCond(); + // If loop preparation eliminates all interesting IV users, bail. + if (IU.empty()) return; + + // Skip nested loops until we can model them better with formulae. + if (!EnableNested && !L->empty()) { + + if (EnablePhiElim) { + // Remove any extra phis created by processing inner loops. + SmallVector<WeakVH, 16> DeadInsts; + SCEVExpander Rewriter(SE, "lsr"); + Changed |= Rewriter.replaceCongruentIVs(L, &DT, DeadInsts); + Changed |= DeleteTriviallyDeadInstructions(DeadInsts); + } + DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n"); + return; + } + // Start collecting data and preparing for the solver. CollectInterestingTypesAndFactors(); CollectFixupsAndInitialFormulae(); @@ -3763,6 +3857,9 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) Types.clear(); RegUses.clear(); + if (Solution.empty()) + return; + #ifndef NDEBUG // Formulae should be legal. for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), @@ -3778,6 +3875,14 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) // Now that we've decided what we want, make it so. ImplementSolution(Solution, P); + + if (EnablePhiElim) { + // Remove any extra phis created by processing inner loops. + SmallVector<WeakVH, 16> DeadInsts; + SCEVExpander Rewriter(SE, "lsr"); + Changed |= Rewriter.replaceCongruentIVs(L, &DT, DeadInsts); + Changed |= DeleteTriviallyDeadInstructions(DeadInsts); + } } void LSRInstance::print_factors_and_types(raw_ostream &OS) const { diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index fef6bc3..91395b2 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -22,6 +22,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include "llvm/Target/TargetData.h" #include <climits> using namespace llvm; @@ -39,6 +40,11 @@ UnrollAllowPartial("unroll-allow-partial", cl::init(false), cl::Hidden, cl::desc("Allows loops to be partially unrolled until " "-unroll-threshold loop size is reached.")); +// Temporary flag to be removed in 3.0 +static cl::opt<bool> +NoSCEVUnroll("disable-unroll-scev", cl::init(false), cl::Hidden, + cl::desc("Use ScalarEvolution to analyze loop trip counts for unrolling")); + namespace { class LoopUnroll : public LoopPass { public: @@ -49,7 +55,7 @@ namespace { CurrentAllowPartial = (P == -1) ? 
UnrollAllowPartial : (bool)P; UserThreshold = (T != -1) || (UnrollThreshold.getNumOccurrences() > 0); - + initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); } @@ -57,11 +63,11 @@ namespace { /// that the loop unroll should be performed regardless of how much /// code expansion would result. static const unsigned NoThreshold = UINT_MAX; - + // Threshold to use when optsize is specified (and there is no // explicit -unroll-threshold). static const unsigned OptSizeUnrollThreshold = 50; - + unsigned CurrentCount; unsigned CurrentThreshold; bool CurrentAllowPartial; @@ -79,6 +85,7 @@ namespace { AU.addPreservedID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); + AU.addRequired<ScalarEvolution>(); AU.addPreserved<ScalarEvolution>(); // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info. // If loop unroll does not preserve dom info then LCSSA pass on next @@ -101,45 +108,62 @@ Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial) { } /// ApproximateLoopSize - Approximate the size of the loop. -static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls) { +static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, + const TargetData *TD) { CodeMetrics Metrics; for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) - Metrics.analyzeBasicBlock(*I); + Metrics.analyzeBasicBlock(*I, TD); NumCalls = Metrics.NumInlineCandidates; - + unsigned LoopSize = Metrics.NumInsts; - + // Don't allow an estimate of size zero. This would allows unrolling of loops // with huge iteration counts, which is a compile time problem even if it's // not a problem for code quality. if (LoopSize == 0) LoopSize = 1; - + return LoopSize; } bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { LoopInfo *LI = &getAnalysis<LoopInfo>(); + ScalarEvolution *SE = &getAnalysis<ScalarEvolution>(); BasicBlock *Header = L->getHeader(); DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName() << "] Loop %" << Header->getName() << "\n"); (void)Header; - + // Determine the current unrolling threshold. While this is normally set // from UnrollThreshold, it is overridden to a smaller value if the current // function is marked as optimize-for-size, and the unroll threshold was // not user specified. unsigned Threshold = CurrentThreshold; - if (!UserThreshold && + if (!UserThreshold && Header->getParent()->hasFnAttr(Attribute::OptimizeForSize)) Threshold = OptSizeUnrollThreshold; - // Find trip count - unsigned TripCount = L->getSmallConstantTripCount(); - unsigned Count = CurrentCount; - + // Find trip count and trip multiple if count is not available + unsigned TripCount = 0; + unsigned TripMultiple = 1; + if (!NoSCEVUnroll) { + // Find "latch trip count". UnrollLoop assumes that control cannot exit + // via the loop latch on any iteration prior to TripCount. The loop may exit + // early via an earlier branch. + BasicBlock *LatchBlock = L->getLoopLatch(); + if (LatchBlock) { + TripCount = SE->getSmallConstantTripCount(L, LatchBlock); + TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock); + } + } + else { + TripCount = L->getSmallConstantTripCount(); + if (TripCount == 0) + TripMultiple = L->getSmallConstantTripMultiple(); + } // Automatically select an unroll count. 
+ unsigned Count = CurrentCount; if (Count == 0) { // Conservative heuristic: if we know the trip count, see if we can // completely unroll (subject to the threshold, checked below); otherwise @@ -152,8 +176,9 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // Enforce the threshold. if (Threshold != NoThreshold) { + const TargetData *TD = getAnalysisIfAvailable<TargetData>(); unsigned NumInlineCandidates; - unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates); + unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates, TD); DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); if (NumInlineCandidates != 0) { DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); @@ -182,12 +207,8 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { } // Unroll the loop. - Function *F = L->getHeader()->getParent(); - if (!UnrollLoop(L, Count, LI, &LPM)) + if (!UnrollLoop(L, Count, TripCount, TripMultiple, LI, &LPM)) return false; - // FIXME: Reconstruct dom info, because it is not preserved properly. - if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) - DT->runOnFunction(*F); return true; } diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 840c4b6..458949c 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -492,7 +492,7 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, Value *BranchVal = LIC; if (!isa<ConstantInt>(Val) || Val->getType() != Type::getInt1Ty(LIC->getContext())) - BranchVal = new ICmpInst(InsertPt, ICmpInst::ICMP_EQ, LIC, Val, "tmp"); + BranchVal = new ICmpInst(InsertPt, ICmpInst::ICMP_EQ, LIC, Val); else if (Val != ConstantInt::getTrue(Val->getContext())) // We want to enter the new loop when the condition is true. std::swap(TrueDest, FalseDest); @@ -561,10 +561,17 @@ void LoopUnswitch::SplitExitEdges(Loop *L, BasicBlock *ExitBlock = ExitBlocks[i]; SmallVector<BasicBlock *, 4> Preds(pred_begin(ExitBlock), pred_end(ExitBlock)); + // Although SplitBlockPredecessors doesn't preserve loop-simplify in // general, if we call it on all predecessors of all exits then it does. - SplitBlockPredecessors(ExitBlock, Preds.data(), Preds.size(), - ".us-lcssa", this); + if (!ExitBlock->isLandingPad()) { + SplitBlockPredecessors(ExitBlock, Preds.data(), Preds.size(), + ".us-lcssa", this); + } else { + SmallVector<BasicBlock*, 2> NewBBs; + SplitLandingPadPredecessors(ExitBlock, Preds, ".us-lcssa", ".us-lcssa", + this, NewBBs); + } } } @@ -632,7 +639,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, // as well. ParentLoop->addBasicBlockToLoop(NewBlocks[0], LI->getBase()); } - + for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[i]]); // The new exit block should be in the same loop as the old one. @@ -653,6 +660,19 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, if (It != VMap.end()) V = It->second; PN->addIncoming(V, NewExit); } + + if (LandingPadInst *LPad = NewExit->getLandingPadInst()) { + PN = PHINode::Create(LPad->getType(), 0, "", + ExitSucc->getFirstInsertionPt()); + + for (pred_iterator I = pred_begin(ExitSucc), E = pred_end(ExitSucc); + I != E; ++I) { + BasicBlock *BB = *I; + LandingPadInst *LPI = BB->getLandingPadInst(); + LPI->replaceAllUsesWith(PN); + PN->addIncoming(LPI, BB); + } + } } // Rewrite the code to refer to itself. 
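A small standalone example, independent of the patch, of why the LoopUnrollPass.cpp change above tracks a trip multiple separately from a trip count: when the latch is known to execute a multiple of 4 times, unrolling by 4 needs no remainder iterations even though the exact count is unknown at compile time. The function names and the factor 4 are illustrative only.

#include <cassert>

static int work(unsigned i) { return static_cast<int>(i); }

// Trip count 4*n: unknown at compile time, but always a multiple of 4.
static int original(unsigned n) {
  int sum = 0;
  for (unsigned i = 0; i != 4 * n; ++i)
    sum += work(i);
  return sum;
}

// Unrolled by the trip multiple: no prologue or epilogue loop is required.
static int unrolledBy4(unsigned n) {
  int sum = 0;
  for (unsigned i = 0; i != 4 * n; i += 4)
    sum += work(i) + work(i + 1) + work(i + 2) + work(i + 3);
  return sum;
}

int main() {
  for (unsigned n = 0; n != 8; ++n)
    assert(original(n) == unrolledBy4(n));
  return 0;
}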
diff --git a/lib/Transforms/Scalar/LowerAtomic.cpp b/lib/Transforms/Scalar/LowerAtomic.cpp index 9087b46..689bbe9 100644 --- a/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/lib/Transforms/Scalar/LowerAtomic.cpp @@ -20,98 +20,88 @@ #include "llvm/Support/IRBuilder.h" using namespace llvm; -static bool LowerAtomicIntrinsic(IntrinsicInst *II) { - IRBuilder<> Builder(II->getParent(), II); - unsigned IID = II->getIntrinsicID(); - switch (IID) { - case Intrinsic::memory_barrier: - break; +static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { + IRBuilder<> Builder(CXI->getParent(), CXI); + Value *Ptr = CXI->getPointerOperand(); + Value *Cmp = CXI->getCompareOperand(); + Value *Val = CXI->getNewValOperand(); + + LoadInst *Orig = Builder.CreateLoad(Ptr); + Value *Equal = Builder.CreateICmpEQ(Orig, Cmp); + Value *Res = Builder.CreateSelect(Equal, Val, Orig); + Builder.CreateStore(Res, Ptr); + + CXI->replaceAllUsesWith(Orig); + CXI->eraseFromParent(); + return true; +} - case Intrinsic::atomic_load_add: - case Intrinsic::atomic_load_sub: - case Intrinsic::atomic_load_and: - case Intrinsic::atomic_load_nand: - case Intrinsic::atomic_load_or: - case Intrinsic::atomic_load_xor: - case Intrinsic::atomic_load_max: - case Intrinsic::atomic_load_min: - case Intrinsic::atomic_load_umax: - case Intrinsic::atomic_load_umin: { - Value *Ptr = II->getArgOperand(0), *Delta = II->getArgOperand(1); +static bool LowerAtomicRMWInst(AtomicRMWInst *RMWI) { + IRBuilder<> Builder(RMWI->getParent(), RMWI); + Value *Ptr = RMWI->getPointerOperand(); + Value *Val = RMWI->getValOperand(); - LoadInst *Orig = Builder.CreateLoad(Ptr); - Value *Res = NULL; - switch (IID) { - default: assert(0 && "Unrecognized atomic modify operation"); - case Intrinsic::atomic_load_add: - Res = Builder.CreateAdd(Orig, Delta); - break; - case Intrinsic::atomic_load_sub: - Res = Builder.CreateSub(Orig, Delta); - break; - case Intrinsic::atomic_load_and: - Res = Builder.CreateAnd(Orig, Delta); - break; - case Intrinsic::atomic_load_nand: - Res = Builder.CreateNot(Builder.CreateAnd(Orig, Delta)); - break; - case Intrinsic::atomic_load_or: - Res = Builder.CreateOr(Orig, Delta); - break; - case Intrinsic::atomic_load_xor: - Res = Builder.CreateXor(Orig, Delta); - break; - case Intrinsic::atomic_load_max: - Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Delta), - Delta, Orig); - break; - case Intrinsic::atomic_load_min: - Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Delta), - Orig, Delta); - break; - case Intrinsic::atomic_load_umax: - Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Delta), - Delta, Orig); - break; - case Intrinsic::atomic_load_umin: - Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Delta), - Orig, Delta); - break; - } - Builder.CreateStore(Res, Ptr); + LoadInst *Orig = Builder.CreateLoad(Ptr); + Value *Res = NULL; - II->replaceAllUsesWith(Orig); + switch (RMWI->getOperation()) { + default: llvm_unreachable("Unexpected RMW operation"); + case AtomicRMWInst::Xchg: + Res = Val; break; - } - - case Intrinsic::atomic_swap: { - Value *Ptr = II->getArgOperand(0), *Val = II->getArgOperand(1); - LoadInst *Orig = Builder.CreateLoad(Ptr); - Builder.CreateStore(Val, Ptr); - II->replaceAllUsesWith(Orig); + case AtomicRMWInst::Add: + Res = Builder.CreateAdd(Orig, Val); break; - } - - case Intrinsic::atomic_cmp_swap: { - Value *Ptr = II->getArgOperand(0), *Cmp = II->getArgOperand(1); - Value *Val = II->getArgOperand(2); - - LoadInst *Orig = Builder.CreateLoad(Ptr); - Value *Equal = Builder.CreateICmpEQ(Orig, Cmp); 
- Value *Res = Builder.CreateSelect(Equal, Val, Orig); - Builder.CreateStore(Res, Ptr); - II->replaceAllUsesWith(Orig); + case AtomicRMWInst::Sub: + Res = Builder.CreateSub(Orig, Val); + break; + case AtomicRMWInst::And: + Res = Builder.CreateAnd(Orig, Val); + break; + case AtomicRMWInst::Nand: + Res = Builder.CreateNot(Builder.CreateAnd(Orig, Val)); + break; + case AtomicRMWInst::Or: + Res = Builder.CreateOr(Orig, Val); + break; + case AtomicRMWInst::Xor: + Res = Builder.CreateXor(Orig, Val); + break; + case AtomicRMWInst::Max: + Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val), + Val, Orig); + break; + case AtomicRMWInst::Min: + Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val), + Orig, Val); + break; + case AtomicRMWInst::UMax: + Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val), + Val, Orig); + break; + case AtomicRMWInst::UMin: + Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val), + Orig, Val); break; } + Builder.CreateStore(Res, Ptr); + RMWI->replaceAllUsesWith(Orig); + RMWI->eraseFromParent(); + return true; +} - default: - return false; - } +static bool LowerFenceInst(FenceInst *FI) { + FI->eraseFromParent(); + return true; +} - assert(II->use_empty() && - "Lowering should have eliminated any uses of the intrinsic call!"); - II->eraseFromParent(); +static bool LowerLoadInst(LoadInst *LI) { + LI->setAtomic(NotAtomic); + return true; +} +static bool LowerStoreInst(StoreInst *SI) { + SI->setAtomic(NotAtomic); return true; } @@ -123,9 +113,22 @@ namespace { } bool runOnBasicBlock(BasicBlock &BB) { bool Changed = false; - for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(DI++)) - Changed |= LowerAtomicIntrinsic(II); + for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) { + Instruction *Inst = DI++; + if (FenceInst *FI = dyn_cast<FenceInst>(Inst)) + Changed |= LowerFenceInst(FI); + else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(Inst)) + Changed |= LowerAtomicCmpXchgInst(CXI); + else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(Inst)) + Changed |= LowerAtomicRMWInst(RMWI); + else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + if (LI->isAtomic()) + LowerLoadInst(LI); + } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + if (SI->isAtomic()) + LowerStoreInst(SI); + } + } return Changed; } }; diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index ba5ee68..298d692 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -384,7 +384,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) { // If this is a store, see if we can merge it in. - if (NextStore->isVolatile()) break; + if (!NextStore->isSimple()) break; // Check to see if this stored value is of the same byte-splattable value. if (ByteVal != isBytewiseValue(NextStore->getOperand(0))) @@ -479,7 +479,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { - if (SI->isVolatile()) return false; + if (!SI->isSimple()) return false; if (TD == 0) return false; @@ -487,7 +487,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { // happen to be using a load-store pair to implement it, rather than // a memcpy. 
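Aside (not part of the patch): the isSimple() checks introduced in the MemCpyOpt hunks above tighten the old isVolatile() tests, so a load or store now also has to be non-atomic before the pass will touch it. The pattern the surrounding code looks for is an aggregate copy written as a wide load feeding a single store in the same block. A source-level sketch of that shape, with made-up names; whether a front end emits a load/store pair or a memcpy for this depends on the type:

    #include <cstring>
    #include <cstdio>

    struct Blob { char bytes[64]; };

    // A plain struct assignment may be lowered to a wide load + store pair;
    // when both sides are "simple" (non-volatile, non-atomic), MemCpyOpt may
    // treat the pair like a memcpy and optimize it further.
    void copy_blob(Blob *dst, const Blob *src) {
      *dst = *src;
    }

    int main() {
      Blob a, b;
      std::memset(a.bytes, 'x', sizeof a.bytes);
      copy_blob(&b, &a);
      std::printf("%c\n", b.bytes[63]); // x
      return 0;
    }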
if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) { - if (!LI->isVolatile() && LI->hasOneUse() && + if (LI->isSimple() && LI->hasOneUse() && LI->getParent() == SI->getParent()) { MemDepResult ldep = MD->getDependency(LI); CallInst *C = 0; @@ -806,21 +806,26 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { // a) memcpy-memcpy xform which exposes redundance for DSE. // b) call-memcpy xform for return slot optimization. MemDepResult DepInfo = MD->getDependency(M); - if (!DepInfo.isClobber()) - return false; - - if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst())) - return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue()); - - if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) { - if (performCallSlotOptzn(M, M->getDest(), M->getSource(), - CopySize->getZExtValue(), C)) { - MD->removeInstruction(M); - M->eraseFromParent(); - return true; + if (DepInfo.isClobber()) { + if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) { + if (performCallSlotOptzn(M, M->getDest(), M->getSource(), + CopySize->getZExtValue(), C)) { + MD->removeInstruction(M); + M->eraseFromParent(); + return true; + } } } + AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + AliasAnalysis::Location SrcLoc = AA.getLocationForSource(M); + MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(SrcLoc, true, + M, M->getParent()); + if (SrcDepInfo.isClobber()) { + if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst())) + return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue()); + } + return false; } @@ -860,7 +865,7 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { // Find out what feeds this byval argument. Value *ByValArg = CS.getArgument(ArgNo); - Type *ByValTy =cast<PointerType>(ByValArg->getType())->getElementType(); + Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType(); uint64_t ByValSize = TD->getTypeAllocSize(ByValTy); MemDepResult DepInfo = MD->getPointerDependencyFrom(AliasAnalysis::Location(ByValArg, ByValSize), diff --git a/lib/Transforms/Scalar/ObjCARC.cpp b/lib/Transforms/Scalar/ObjCARC.cpp index f2e5ff9..80f5f01 100644 --- a/lib/Transforms/Scalar/ObjCARC.cpp +++ b/lib/Transforms/Scalar/ObjCARC.cpp @@ -344,6 +344,10 @@ static InstructionClass GetInstructionClass(const Value *V) { break; default: // For anything else, check all the operands. + // Note that this includes both operands of a Store: while the first + // operand isn't actually being dereferenced, it is being stored to + // memory where we can no longer track who might read it and dereference + // it, so we have to consider it potentially used. for (User::const_op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) if (IsPotentialUse(*OI)) @@ -421,9 +425,10 @@ static bool IsAlwaysTail(InstructionClass Class) { /// IsNoThrow - Test if the given class represents instructions which are always /// safe to mark with the nounwind attribute.. static bool IsNoThrow(InstructionClass Class) { + // objc_retainBlock is not nounwind because it calls user copy constructors + // which could theoretically throw. return Class == IC_Retain || Class == IC_RetainRV || - Class == IC_RetainBlock || Class == IC_Release || Class == IC_Autorelease || Class == IC_AutoreleaseRV || @@ -515,6 +520,10 @@ static bool IsObjCIdentifiedObject(const Value *V) { const Value *Pointer = StripPointerCastsAndObjCCalls(LI->getPointerOperand()); if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Pointer)) { + // A constant pointer can't be pointing to an object on the heap. 
It may + // be reference-counted, but it won't be deleted. + if (GV->isConstant()) + return true; StringRef Name = GV->getName(); // These special variables are known to hold values which are not // reference-counted pointers. @@ -738,7 +747,6 @@ ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) { switch (GetBasicInstructionClass(CS.getInstruction())) { case IC_Retain: case IC_RetainRV: - case IC_RetainBlock: case IC_Autorelease: case IC_AutoreleaseRV: case IC_NoopCast: @@ -746,6 +754,8 @@ ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) { case IC_FusedRetainAutorelease: case IC_FusedRetainAutoreleaseRV: // These functions don't access any memory visible to the compiler. + // Note that this doesn't include objc_retainBlock, becuase it updates + // pointers when it copies block data. return NoModRef; default: break; @@ -877,7 +887,9 @@ bool ObjCARCExpand::runOnFunction(Function &F) { // usually can't sink them past other calls, which would be the main // case where it would be useful. -/// TODO: The pointer returned from objc_loadWeakRetained is retained. +// TODO: The pointer returned from objc_loadWeakRetained is retained. + +// TODO: Delete release+retain pairs (rare). #include "llvm/GlobalAlias.h" #include "llvm/Constants.h" @@ -1098,16 +1110,16 @@ static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) { if (A == S_None || B == S_None) return S_None; - // Note that we can't merge S_CanRelease and S_Use. if (A > B) std::swap(A, B); if (TopDown) { // Choose the side which is further along in the sequence. - if (A == S_Retain && (B == S_CanRelease || B == S_Use)) + if ((A == S_Retain || A == S_CanRelease) && + (B == S_CanRelease || B == S_Use)) return B; } else { // Choose the side which is further along in the sequence. if ((A == S_Use || A == S_CanRelease) && - (B == S_Release || B == S_Stop || B == S_MovableRelease)) + (B == S_Use || B == S_Release || B == S_Stop || B == S_MovableRelease)) return A; // If both sides are releases, choose the more conservative one. if (A == S_Stop && (B == S_Release || B == S_MovableRelease)) @@ -1124,22 +1136,37 @@ namespace { /// retain-decrement-use-release sequence or release-use-decrement-retain /// reverese sequence. struct RRInfo { - /// KnownIncremented - After an objc_retain, the reference count of the - /// referenced object is known to be positive. Similarly, before an - /// objc_release, the reference count of the referenced object is known to - /// be positive. If there are retain-release pairs in code regions where the - /// retain count is known to be positive, they can be eliminated, regardless - /// of any side effects between them. - bool KnownIncremented; + /// KnownSafe - After an objc_retain, the reference count of the referenced + /// object is known to be positive. Similarly, before an objc_release, the + /// reference count of the referenced object is known to be positive. If + /// there are retain-release pairs in code regions where the retain count + /// is known to be positive, they can be eliminated, regardless of any side + /// effects between them. + /// + /// Also, a retain+release pair nested within another retain+release + /// pair all on the known same pointer value can be eliminated, regardless + /// of any intervening side effects. + /// + /// KnownSafe is true when either of these conditions is satisfied. + bool KnownSafe; /// IsRetainBlock - True if the Calls are objc_retainBlock calls (as /// opposed to objc_retain calls). 
bool IsRetainBlock; + /// CopyOnEscape - True if this the Calls are objc_retainBlock calls + /// which all have the !clang.arc.copy_on_escape metadata. + bool CopyOnEscape; + /// IsTailCallRelease - True of the objc_release calls are all marked /// with the "tail" keyword. bool IsTailCallRelease; + /// Partial - True of we've seen an opportunity for partial RR elimination, + /// such as pushing calls into a CFG triangle or into one side of a + /// CFG diamond. + bool Partial; + /// ReleaseMetadata - If the Calls are objc_release calls and they all have /// a clang.imprecise_release tag, this is the metadata tag. MDNode *ReleaseMetadata; @@ -1153,7 +1180,8 @@ namespace { SmallPtrSet<Instruction *, 2> ReverseInsertPts; RRInfo() : - KnownIncremented(false), IsRetainBlock(false), IsTailCallRelease(false), + KnownSafe(false), IsRetainBlock(false), CopyOnEscape(false), + IsTailCallRelease(false), Partial(false), ReleaseMetadata(0) {} void clear(); @@ -1161,9 +1189,11 @@ namespace { } void RRInfo::clear() { - KnownIncremented = false; + KnownSafe = false; IsRetainBlock = false; + CopyOnEscape = false; IsTailCallRelease = false; + Partial = false; ReleaseMetadata = 0; Calls.clear(); ReverseInsertPts.clear(); @@ -1176,6 +1206,9 @@ namespace { /// RefCount - The known minimum number of reference count increments. unsigned RefCount; + /// NestCount - The known minimum level of retain+release nesting. + unsigned NestCount; + /// Seq - The current position in the sequence. Sequence Seq; @@ -1184,7 +1217,11 @@ namespace { /// TODO: Encapsulate this better. RRInfo RRI; - PtrState() : RefCount(0), Seq(S_None) {} + PtrState() : RefCount(0), NestCount(0), Seq(S_None) {} + + void SetAtLeastOneRefCount() { + if (RefCount == 0) RefCount = 1; + } void IncrementRefCount() { if (RefCount != UINT_MAX) ++RefCount; @@ -1194,14 +1231,22 @@ namespace { if (RefCount != 0) --RefCount; } - void ClearRefCount() { - RefCount = 0; - } - bool IsKnownIncremented() const { return RefCount > 0; } + void IncrementNestCount() { + if (NestCount != UINT_MAX) ++NestCount; + } + + void DecrementNestCount() { + if (NestCount != 0) --NestCount; + } + + bool IsKnownNested() const { + return NestCount > 0; + } + void SetSeq(Sequence NewSeq) { Seq = NewSeq; } @@ -1233,23 +1278,40 @@ void PtrState::Merge(const PtrState &Other, bool TopDown) { Seq = MergeSeqs(Seq, Other.Seq, TopDown); RefCount = std::min(RefCount, Other.RefCount); + NestCount = std::min(NestCount, Other.NestCount); // We can't merge a plain objc_retain with an objc_retainBlock. if (RRI.IsRetainBlock != Other.RRI.IsRetainBlock) Seq = S_None; + // If we're not in a sequence (anymore), drop all associated state. if (Seq == S_None) { RRI.clear(); + } else if (RRI.Partial || Other.RRI.Partial) { + // If we're doing a merge on a path that's previously seen a partial + // merge, conservatively drop the sequence, to avoid doing partial + // RR elimination. If the branch predicates for the two merge differ, + // mixing them is unsafe. + Seq = S_None; + RRI.clear(); } else { // Conservatively merge the ReleaseMetadata information. 
if (RRI.ReleaseMetadata != Other.RRI.ReleaseMetadata) RRI.ReleaseMetadata = 0; - RRI.KnownIncremented = RRI.KnownIncremented && Other.RRI.KnownIncremented; + RRI.CopyOnEscape = RRI.CopyOnEscape && Other.RRI.CopyOnEscape; + RRI.KnownSafe = RRI.KnownSafe && Other.RRI.KnownSafe; RRI.IsTailCallRelease = RRI.IsTailCallRelease && Other.RRI.IsTailCallRelease; RRI.Calls.insert(Other.RRI.Calls.begin(), Other.RRI.Calls.end()); - RRI.ReverseInsertPts.insert(Other.RRI.ReverseInsertPts.begin(), - Other.RRI.ReverseInsertPts.end()); + + // Merge the insert point sets. If there are any differences, + // that makes this a partial merge. + RRI.Partial = RRI.ReverseInsertPts.size() != + Other.RRI.ReverseInsertPts.size(); + for (SmallPtrSet<Instruction *, 2>::const_iterator + I = Other.RRI.ReverseInsertPts.begin(), + E = Other.RRI.ReverseInsertPts.end(); I != E; ++I) + RRI.Partial |= RRI.ReverseInsertPts.insert(*I); } } @@ -1316,7 +1378,7 @@ namespace { } void clearBottomUpPointers() { - PerPtrTopDown.clear(); + PerPtrBottomUp.clear(); } void clearTopDownPointers() { @@ -1334,6 +1396,12 @@ namespace { unsigned GetAllPathCount() const { return TopDownPathCount * BottomUpPathCount; } + + /// IsVisitedTopDown - Test whether the block for this BBState has been + /// visited by the top-down portion of the algorithm. + bool isVisitedTopDown() const { + return TopDownPathCount != 0; + } }; } @@ -1364,7 +1432,7 @@ void BBState::MergePred(const BBState &Other) { /*TopDown=*/true); } - // For each entry in our set, if the other set doens't have an entry with the + // For each entry in our set, if the other set doesn't have an entry with the // same key, force it to merge with an empty entry. for (ptr_iterator MI = top_down_ptr_begin(), ME = top_down_ptr_end(); MI != ME; ++MI) @@ -1389,7 +1457,7 @@ void BBState::MergeSucc(const BBState &Other) { /*TopDown=*/false); } - // For each entry in our set, if the other set doens't have an entry + // For each entry in our set, if the other set doesn't have an entry // with the same key, force it to merge with an empty entry. for (ptr_iterator MI = bottom_up_ptr_begin(), ME = bottom_up_ptr_end(); MI != ME; ++MI) @@ -1406,15 +1474,11 @@ namespace { /// Run - A flag indicating whether this optimization pass should run. bool Run; - /// RetainFunc, RelaseFunc - Declarations for objc_retain, - /// objc_retainBlock, and objc_release. - Function *RetainFunc, *RetainBlockFunc, *RetainRVFunc, *ReleaseFunc; - /// RetainRVCallee, etc. - Declarations for ObjC runtime /// functions, for use in creating calls to them. These are initialized /// lazily to avoid cluttering up the Module with unused declarations. Constant *RetainRVCallee, *AutoreleaseRVCallee, *ReleaseCallee, - *RetainCallee, *AutoreleaseCallee; + *RetainCallee, *RetainBlockCallee, *AutoreleaseCallee; /// UsedInThisFunciton - Flags which determine whether each of the /// interesting runtine functions is in fact used in the current function. @@ -1424,10 +1488,15 @@ namespace { /// metadata. unsigned ImpreciseReleaseMDKind; + /// CopyOnEscape - The Metadata Kind for clang.arc.copy_on_escape + /// metadata. 
+ unsigned CopyOnEscapeMDKind; + Constant *getRetainRVCallee(Module *M); Constant *getAutoreleaseRVCallee(Module *M); Constant *getReleaseCallee(Module *M); Constant *getRetainCallee(Module *M); + Constant *getRetainBlockCallee(Module *M); Constant *getAutoreleaseCallee(Module *M); void OptimizeRetainCall(Function &F, Instruction *Retain); @@ -1452,11 +1521,13 @@ namespace { void MoveCalls(Value *Arg, RRInfo &RetainsToMove, RRInfo &ReleasesToMove, MapVector<Value *, RRInfo> &Retains, DenseMap<Value *, RRInfo> &Releases, - SmallVectorImpl<Instruction *> &DeadInsts); + SmallVectorImpl<Instruction *> &DeadInsts, + Module *M); bool PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates, MapVector<Value *, RRInfo> &Retains, - DenseMap<Value *, RRInfo> &Releases); + DenseMap<Value *, RRInfo> &Releases, + Module *M); void OptimizeWeakCalls(Function &F); @@ -1561,6 +1632,23 @@ Constant *ObjCARCOpt::getRetainCallee(Module *M) { return RetainCallee; } +Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) { + if (!RetainBlockCallee) { + LLVMContext &C = M->getContext(); + std::vector<Type *> Params; + Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C))); + AttrListPtr Attributes; + // objc_retainBlock is not nounwind because it calls user copy constructors + // which could theoretically throw. + RetainBlockCallee = + M->getOrInsertFunction( + "objc_retainBlock", + FunctionType::get(Params[0], Params, /*isVarArg=*/false), + Attributes); + } + return RetainBlockCallee; +} + Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) { if (!AutoreleaseCallee) { LLVMContext &C = M->getContext(); @@ -1904,12 +1992,19 @@ void ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV) { // Check for a return of the pointer value. 
const Value *Ptr = GetObjCArg(AutoreleaseRV); - for (Value::const_use_iterator UI = Ptr->use_begin(), UE = Ptr->use_end(); - UI != UE; ++UI) { - const User *I = *UI; - if (isa<ReturnInst>(I) || GetBasicInstructionClass(I) == IC_RetainRV) - return; - } + SmallVector<const Value *, 2> Users; + Users.push_back(Ptr); + do { + Ptr = Users.pop_back_val(); + for (Value::const_use_iterator UI = Ptr->use_begin(), UE = Ptr->use_end(); + UI != UE; ++UI) { + const User *I = *UI; + if (isa<ReturnInst>(I) || GetBasicInstructionClass(I) == IC_RetainRV) + return; + if (isa<BitCastInst>(I)) + Users.push_back(I); + } + } while (!Users.empty()); Changed = true; ++NumPeeps; @@ -2132,41 +2227,49 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); bool SomeSuccHasSame = false; bool AllSuccsHaveSame = true; - for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) - switch (BBStates[*SI].getPtrBottomUpState(Arg).GetSeq()) { + PtrState &S = MyStates.getPtrTopDownState(Arg); + for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) { + PtrState &SuccS = BBStates[*SI].getPtrBottomUpState(Arg); + switch (SuccS.GetSeq()) { case S_None: - case S_CanRelease: - MyStates.getPtrTopDownState(Arg).ClearSequenceProgress(); - SomeSuccHasSame = false; - break; + case S_CanRelease: { + if (!S.RRI.KnownSafe && !SuccS.RRI.KnownSafe) + S.ClearSequenceProgress(); + continue; + } case S_Use: SomeSuccHasSame = true; break; case S_Stop: case S_Release: case S_MovableRelease: - AllSuccsHaveSame = false; + if (!S.RRI.KnownSafe && !SuccS.RRI.KnownSafe) + AllSuccsHaveSame = false; break; case S_Retain: llvm_unreachable("bottom-up pointer in retain state!"); } + } // If the state at the other end of any of the successor edges // matches the current state, require all edges to match. This // guards against loops in the middle of a sequence. if (SomeSuccHasSame && !AllSuccsHaveSame) - MyStates.getPtrTopDownState(Arg).ClearSequenceProgress(); + S.ClearSequenceProgress(); } case S_CanRelease: { const Value *Arg = I->first; const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); bool SomeSuccHasSame = false; bool AllSuccsHaveSame = true; - for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) - switch (BBStates[*SI].getPtrBottomUpState(Arg).GetSeq()) { - case S_None: - MyStates.getPtrTopDownState(Arg).ClearSequenceProgress(); - SomeSuccHasSame = false; - break; + PtrState &S = MyStates.getPtrTopDownState(Arg); + for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) { + PtrState &SuccS = BBStates[*SI].getPtrBottomUpState(Arg); + switch (SuccS.GetSeq()) { + case S_None: { + if (!S.RRI.KnownSafe && !SuccS.RRI.KnownSafe) + S.ClearSequenceProgress(); + continue; + } case S_CanRelease: SomeSuccHasSame = true; break; @@ -2174,16 +2277,18 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, case S_Release: case S_MovableRelease: case S_Use: - AllSuccsHaveSame = false; + if (!S.RRI.KnownSafe && !SuccS.RRI.KnownSafe) + AllSuccsHaveSame = false; break; case S_Retain: llvm_unreachable("bottom-up pointer in retain state!"); } + } // If the state at the other end of any of the successor edges // matches the current state, require all edges to match. This // guards against loops in the middle of a sequence. 
if (SomeSuccHasSame && !AllSuccsHaveSame) - MyStates.getPtrTopDownState(Arg).ClearSequenceProgress(); + S.ClearSequenceProgress(); } } } @@ -2207,6 +2312,8 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, if (Succ == BB) continue; DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ); + // If we haven't seen this node yet, then we've found a CFG cycle. + // Be optimistic here; it's CheckForCFGHazards' job detect trouble. if (I == BBStates.end()) continue; MyStates.InitFromSucc(I->second); @@ -2245,11 +2352,12 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, S.SetSeqToRelease(Inst->getMetadata(ImpreciseReleaseMDKind)); S.RRI.clear(); - S.RRI.KnownIncremented = S.IsKnownIncremented(); + S.RRI.KnownSafe = S.IsKnownNested() || S.IsKnownIncremented(); S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); S.RRI.Calls.insert(Inst); S.IncrementRefCount(); + S.IncrementNestCount(); break; } case IC_RetainBlock: @@ -2259,6 +2367,16 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, PtrState &S = MyStates.getPtrBottomUpState(Arg); S.DecrementRefCount(); + S.SetAtLeastOneRefCount(); + S.DecrementNestCount(); + + // An non-copy-on-escape objc_retainBlock call with just a use still + // needs to be kept, because it may be copying a block from the stack + // to the heap. + if (Class == IC_RetainBlock && + !Inst->getMetadata(CopyOnEscapeMDKind) && + S.GetSeq() == S_Use) + S.SetSeq(S_CanRelease); switch (S.GetSeq()) { case S_Stop: @@ -2272,6 +2390,8 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, // better to let it remain as the first instruction after a call. if (Class != IC_RetainRV) { S.RRI.IsRetainBlock = Class == IC_RetainBlock; + if (S.RRI.IsRetainBlock) + S.RRI.CopyOnEscape = !!Inst->getMetadata(CopyOnEscapeMDKind); Retains[Inst] = S.RRI; } S.ClearSequenceProgress(); @@ -2281,7 +2401,7 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, case S_Retain: llvm_unreachable("bottom-up pointer in retain state!"); } - break; + continue; } case IC_AutoreleasepoolPop: // Conservatively, clear MyStates for all known pointers. @@ -2305,26 +2425,22 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, PtrState &S = MI->second; Sequence Seq = S.GetSeq(); - // Check for possible retains and releases. + // Check for possible releases. if (CanAlterRefCount(Inst, Ptr, PA, Class)) { - // Check for a retain (we're going bottom-up here). S.DecrementRefCount(); - - // Check for a release. - if (!IsRetain(Class) && Class != IC_RetainBlock) - switch (Seq) { - case S_Use: - S.SetSeq(S_CanRelease); - continue; - case S_CanRelease: - case S_Release: - case S_MovableRelease: - case S_Stop: - case S_None: - break; - case S_Retain: - llvm_unreachable("bottom-up pointer in retain state!"); - } + switch (Seq) { + case S_Use: + S.SetSeq(S_CanRelease); + continue; + case S_CanRelease: + case S_Release: + case S_MovableRelease: + case S_Stop: + case S_None: + break; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + } } // Check for possible direct uses. @@ -2332,14 +2448,14 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, case S_Release: case S_MovableRelease: if (CanUse(Inst, Ptr, PA, Class)) { - S.RRI.ReverseInsertPts.clear(); + assert(S.RRI.ReverseInsertPts.empty()); S.RRI.ReverseInsertPts.insert(Inst); S.SetSeq(S_Use); } else if (Seq == S_Release && (Class == IC_User || Class == IC_CallOrUser)) { // Non-movable releases depend on any possible objc pointer use. 
S.SetSeq(S_Stop); - S.RRI.ReverseInsertPts.clear(); + assert(S.RRI.ReverseInsertPts.empty()); S.RRI.ReverseInsertPts.insert(Inst); } break; @@ -2378,14 +2494,18 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, if (Pred == BB) continue; DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred); - if (I == BBStates.end()) + assert(I != BBStates.end()); + // If we haven't seen this node yet, then we've found a CFG cycle. + // Be optimistic here; it's CheckForCFGHazards' job detect trouble. + if (!I->second.isVisitedTopDown()) continue; MyStates.InitFromPred(I->second); while (PI != PE) { Pred = *PI++; if (Pred != BB) { I = BBStates.find(Pred); - if (I != BBStates.end()) + assert(I != BBStates.end()); + if (I->second.isVisitedTopDown()) MyStates.MergePred(I->second); } } @@ -2422,18 +2542,25 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, S.SetSeq(S_Retain); S.RRI.clear(); S.RRI.IsRetainBlock = Class == IC_RetainBlock; - S.RRI.KnownIncremented = S.IsKnownIncremented(); + if (S.RRI.IsRetainBlock) + S.RRI.CopyOnEscape = !!Inst->getMetadata(CopyOnEscapeMDKind); + // Don't check S.IsKnownIncremented() here because it's not + // sufficient. + S.RRI.KnownSafe = S.IsKnownNested(); S.RRI.Calls.insert(Inst); } + S.SetAtLeastOneRefCount(); S.IncrementRefCount(); - break; + S.IncrementNestCount(); + continue; } case IC_Release: { Arg = GetObjCArg(Inst); PtrState &S = MyStates.getPtrTopDownState(Arg); S.DecrementRefCount(); + S.DecrementNestCount(); switch (S.GetSeq()) { case S_Retain: @@ -2478,16 +2605,12 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, Sequence Seq = S.GetSeq(); // Check for possible releases. - if (!IsRetain(Class) && Class != IC_RetainBlock && - CanAlterRefCount(Inst, Ptr, PA, Class)) { - // Check for a release. + if (CanAlterRefCount(Inst, Ptr, PA, Class)) { S.DecrementRefCount(); - - // Check for a release. switch (Seq) { case S_Retain: S.SetSeq(S_CanRelease); - S.RRI.ReverseInsertPts.clear(); + assert(S.RRI.ReverseInsertPts.empty()); S.RRI.ReverseInsertPts.insert(Inst); // One call can't cause a transition from S_Retain to S_CanRelease @@ -2511,8 +2634,19 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, if (CanUse(Inst, Ptr, PA, Class)) S.SetSeq(S_Use); break; - case S_Use: case S_Retain: + // A non-copy-on-scape objc_retainBlock call may be responsible for + // copying the block data from the stack to the heap. Model this by + // moving it straight from S_Retain to S_Use. + if (S.RRI.IsRetainBlock && + !S.RRI.CopyOnEscape && + CanUse(Inst, Ptr, PA, Class)) { + assert(S.RRI.ReverseInsertPts.empty()); + S.RRI.ReverseInsertPts.insert(Inst); + S.SetSeq(S_Use); + } + break; + case S_Use: case S_None: break; case S_Stop: @@ -2533,28 +2667,43 @@ ObjCARCOpt::Visit(Function &F, DenseMap<const BasicBlock *, BBState> &BBStates, MapVector<Value *, RRInfo> &Retains, DenseMap<Value *, RRInfo> &Releases) { - // Use postorder for bottom-up, and reverse-postorder for top-down, because we + // Use reverse-postorder on the reverse CFG for bottom-up, because we // magically know that loops will be well behaved, i.e. they won't repeatedly - // call retain on a single pointer without doing a release. + // call retain on a single pointer without doing a release. We can't use + // ReversePostOrderTraversal here because we want to walk up from each + // function exit point. 
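Aside (not part of the patch): the hunk that follows implements the traversal described in the comment above: start a DFS from every block with no successors, walk predecessor edges with an explicit stack, record blocks in postorder, then iterate that order in reverse so each block is visited bottom-up from the exits. A stand-alone sketch of the same idea on a made-up four-block diamond, using std::string block names instead of BasicBlock pointers:

    #include <cstddef>
    #include <cstdio>
    #include <map>
    #include <set>
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
      // Predecessor lists for a diamond CFG: entry -> {left, right} -> exit.
      std::map<std::string, std::vector<std::string>> preds = {
          {"entry", {}}, {"left", {"entry"}}, {"right", {"entry"}},
          {"exit", {"left", "right"}}};

      std::set<std::string> visited = {"exit"};
      std::vector<std::pair<std::string, std::size_t>> stack = {{"exit", 0}};
      std::vector<std::string> order; // postorder over predecessor edges

      while (!stack.empty()) {
        auto &frame = stack.back();
        if (frame.second < preds[frame.first].size()) {
          std::string p = preds[frame.first][frame.second++];
          if (visited.insert(p).second)
            stack.push_back({p, 0});
        } else {
          order.push_back(frame.first);
          stack.pop_back();
        }
      }

      // Walking 'order' in reverse visits the exit first, then its predecessors.
      for (auto it = order.rbegin(); it != order.rend(); ++it)
        std::printf("%s ", it->c_str()); // exit right left entry
      std::printf("\n");
      return 0;
    }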
+ SmallPtrSet<BasicBlock *, 16> Visited; + SmallVector<std::pair<BasicBlock *, pred_iterator>, 16> Stack; + SmallVector<BasicBlock *, 16> Order; + for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { + BasicBlock *BB = I; + if (BB->getTerminator()->getNumSuccessors() == 0) + Stack.push_back(std::make_pair(BB, pred_begin(BB))); + } + while (!Stack.empty()) { + pred_iterator End = pred_end(Stack.back().first); + while (Stack.back().second != End) { + BasicBlock *BB = *Stack.back().second++; + if (Visited.insert(BB)) + Stack.push_back(std::make_pair(BB, pred_begin(BB))); + } + Order.push_back(Stack.pop_back_val().first); + } bool BottomUpNestingDetected = false; - SmallVector<BasicBlock *, 8> PostOrder; - for (po_iterator<Function *> I = po_begin(&F), E = po_end(&F); I != E; ++I) { + for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator I = + Order.rbegin(), E = Order.rend(); I != E; ++I) { BasicBlock *BB = *I; - PostOrder.push_back(BB); - BottomUpNestingDetected |= VisitBottomUp(BB, BBStates, Retains); } - // Iterate through the post-order in reverse order, achieving a - // reverse-postorder traversal. We don't use the ReversePostOrderTraversal - // class here because it works by computing its own full postorder iteration, - // recording the sequence, and playing it back in reverse. Since we're already - // doing a full iteration above, we can just record the sequence manually and - // avoid the cost of having ReversePostOrderTraversal compute it. + // Use regular reverse-postorder for top-down. bool TopDownNestingDetected = false; - for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator - RI = PostOrder.rbegin(), RE = PostOrder.rend(); RI != RE; ++RI) - TopDownNestingDetected |= VisitTopDown(*RI, BBStates, Releases); + typedef ReversePostOrderTraversal<Function *> RPOTType; + RPOTType RPOT(&F); + for (RPOTType::rpo_iterator I = RPOT.begin(), E = RPOT.end(); I != E; ++I) { + BasicBlock *BB = *I; + TopDownNestingDetected |= VisitTopDown(BB, BBStates, Releases); + } return TopDownNestingDetected && BottomUpNestingDetected; } @@ -2565,12 +2714,10 @@ void ObjCARCOpt::MoveCalls(Value *Arg, RRInfo &ReleasesToMove, MapVector<Value *, RRInfo> &Retains, DenseMap<Value *, RRInfo> &Releases, - SmallVectorImpl<Instruction *> &DeadInsts) { + SmallVectorImpl<Instruction *> &DeadInsts, + Module *M) { Type *ArgTy = Arg->getType(); - Type *ParamTy = - (RetainRVFunc ? RetainRVFunc : - RetainFunc ? RetainFunc : - RetainBlockFunc)->arg_begin()->getType(); + Type *ParamTy = PointerType::getUnqual(Type::getInt8Ty(ArgTy->getContext())); // Insert the new retain and release calls. for (SmallPtrSet<Instruction *, 2>::const_iterator @@ -2581,9 +2728,12 @@ void ObjCARCOpt::MoveCalls(Value *Arg, new BitCastInst(Arg, ParamTy, "", InsertPt); CallInst *Call = CallInst::Create(RetainsToMove.IsRetainBlock ? - RetainBlockFunc : RetainFunc, + getRetainBlockCallee(M) : getRetainCallee(M), MyArg, "", InsertPt); Call->setDoesNotThrow(); + if (RetainsToMove.CopyOnEscape) + Call->setMetadata(CopyOnEscapeMDKind, + MDNode::get(M->getContext(), ArrayRef<Value *>())); if (!RetainsToMove.IsRetainBlock) Call->setTailCall(); } @@ -2598,8 +2748,8 @@ void ObjCARCOpt::MoveCalls(Value *Arg, // The invoke's return value isn't available in the unwind block, // but our releases will never depend on it, because they must be // paired with retains from before the invoke. 
- InsertPts[0] = II->getNormalDest()->getFirstNonPHI(); - InsertPts[1] = II->getUnwindDest()->getFirstNonPHI(); + InsertPts[0] = II->getNormalDest()->getFirstInsertionPt(); + InsertPts[1] = II->getUnwindDest()->getFirstInsertionPt(); } else { // Insert code immediately after the last use. InsertPts[0] = llvm::next(BasicBlock::iterator(LastUse)); @@ -2609,7 +2759,8 @@ void ObjCARCOpt::MoveCalls(Value *Arg, Instruction *InsertPt = *I; Value *MyArg = ArgTy == ParamTy ? Arg : new BitCastInst(Arg, ParamTy, "", InsertPt); - CallInst *Call = CallInst::Create(ReleaseFunc, MyArg, "", InsertPt); + CallInst *Call = CallInst::Create(getReleaseCallee(M), MyArg, + "", InsertPt); // Attach a clang.imprecise_release metadata tag, if appropriate. if (MDNode *M = ReleasesToMove.ReleaseMetadata) Call->setMetadata(ImpreciseReleaseMDKind, M); @@ -2640,7 +2791,8 @@ bool ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates, MapVector<Value *, RRInfo> &Retains, - DenseMap<Value *, RRInfo> &Releases) { + DenseMap<Value *, RRInfo> &Releases, + Module *M) { bool AnyPairsCompletelyEliminated = false; RRInfo RetainsToMove; RRInfo ReleasesToMove; @@ -2649,21 +2801,37 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> SmallVector<Instruction *, 8> DeadInsts; for (MapVector<Value *, RRInfo>::const_iterator I = Retains.begin(), - E = Retains.end(); I != E; ) { - Value *V = (I++)->first; + E = Retains.end(); I != E; ++I) { + Value *V = I->first; if (!V) continue; // blotted Instruction *Retain = cast<Instruction>(V); Value *Arg = GetObjCArg(Retain); - // If the object being released is in static or stack storage, we know it's + // If the object being released is in static storage, we know it's // not being managed by ObjC reference counting, so we can delete pairs // regardless of what possible decrements or uses lie between them. - bool KnownSafe = isa<Constant>(Arg) || isa<AllocaInst>(Arg); + bool KnownSafe = isa<Constant>(Arg); + + // Same for stack storage, unless this is a non-copy-on-escape + // objc_retainBlock call, which is responsible for copying the block data + // from the stack to the heap. + if ((!I->second.IsRetainBlock || I->second.CopyOnEscape) && + isa<AllocaInst>(Arg)) + KnownSafe = true; + + // A constant pointer can't be pointing to an object on the heap. It may + // be reference-counted, but it won't be deleted. + if (const LoadInst *LI = dyn_cast<LoadInst>(Arg)) + if (const GlobalVariable *GV = + dyn_cast<GlobalVariable>( + StripPointerCastsAndObjCCalls(LI->getPointerOperand()))) + if (GV->isConstant()) + KnownSafe = true; // If a pair happens in a region where it is known that the reference count // is already incremented, we can similarly ignore possible decrements. - bool KnownIncrementedTD = true, KnownIncrementedBU = true; + bool KnownSafeTD = true, KnownSafeBU = true; // Connect the dots between the top-down-collected RetainsToMove and // bottom-up-collected ReleasesToMove to form sets of related calls. 
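Aside (not part of the patch): PerformCodePlacement, continued in the next hunk, only deletes a retain/release pair outright when one side is KnownSafe, i.e. the reference count is provably positive across the region (or the pair is nested inside another pair on the same pointer). A toy C++ model of why that is sufficient; Obj, retain, and release are stand-ins invented for this sketch, not the Objective-C runtime functions:

    #include <cassert>

    struct Obj { int rc; };
    void retain(Obj &o)  { ++o.rc; }
    void release(Obj &o) { --o.rc; }

    int main() {
      Obj o{1};           // an outer reference already holds the object (rc >= 1)

      retain(o);          // inner pair: rc goes 1 -> 2
      /* ... uses of o; nothing here can free it, since rc stays >= 2 ... */
      release(o);         // rc goes 2 -> 1

      // Removing the inner pair leaves the count observed afterwards unchanged,
      // so the object's lifetime is unaffected.
      assert(o.rc == 1);
      return 0;
    }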
@@ -2683,7 +2851,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> MapVector<Value *, RRInfo>::const_iterator It = Retains.find(NewRetain); assert(It != Retains.end()); const RRInfo &NewRetainRRI = It->second; - KnownIncrementedTD &= NewRetainRRI.KnownIncremented; + KnownSafeTD &= NewRetainRRI.KnownSafe; for (SmallPtrSet<Instruction *, 2>::const_iterator LI = NewRetainRRI.Calls.begin(), LE = NewRetainRRI.Calls.end(); LI != LE; ++LI) { @@ -2739,7 +2907,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> Releases.find(NewRelease); assert(It != Releases.end()); const RRInfo &NewReleaseRRI = It->second; - KnownIncrementedBU &= NewReleaseRRI.KnownIncremented; + KnownSafeBU &= NewReleaseRRI.KnownSafe; for (SmallPtrSet<Instruction *, 2>::const_iterator LI = NewReleaseRRI.Calls.begin(), LE = NewReleaseRRI.Calls.end(); LI != LE; ++LI) { @@ -2759,6 +2927,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> // Merge the IsRetainBlock values. if (FirstRetain) { RetainsToMove.IsRetainBlock = NewReleaseRetainRRI.IsRetainBlock; + RetainsToMove.CopyOnEscape = NewReleaseRetainRRI.CopyOnEscape; FirstRetain = false; } else if (ReleasesToMove.IsRetainBlock != NewReleaseRetainRRI.IsRetainBlock) @@ -2766,6 +2935,9 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> // objc_retain and the other uses objc_retainBlock. goto next_retain; + // Merge the CopyOnEscape values. + RetainsToMove.CopyOnEscape &= NewReleaseRetainRRI.CopyOnEscape; + // Collect the optimal insertion points. if (!KnownSafe) for (SmallPtrSet<Instruction *, 2>::const_iterator @@ -2787,12 +2959,19 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> if (NewRetains.empty()) break; } - // If the pointer is known incremented, we can safely delete the pair - // regardless of what's between them. - if (KnownIncrementedTD || KnownIncrementedBU) { + // If the pointer is known incremented or nested, we can safely delete the + // pair regardless of what's between them. + if (KnownSafeTD || KnownSafeBU) { RetainsToMove.ReverseInsertPts.clear(); ReleasesToMove.ReverseInsertPts.clear(); NewCount = 0; + } else { + // Determine whether the new insertion points we computed preserve the + // balance of retain and release calls through the program. + // TODO: If the fully aggressive solution isn't valid, try to find a + // less aggressive solution which is. + if (NewDelta != 0) + goto next_retain; } // Determine whether the original call points are balanced in the retain and @@ -2803,18 +2982,12 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> if (OldDelta != 0) goto next_retain; - // Determine whether the new insertion points we computed preserve the - // balance of retain and release calls through the program. - // TODO: If the fully aggressive solution isn't valid, try to find a - // less aggressive solution which is. - if (NewDelta != 0) - goto next_retain; - // Ok, everything checks out and we're all set. Let's move some code! Changed = true; AnyPairsCompletelyEliminated = NewCount == 0; NumRRs += OldCount - NewCount; - MoveCalls(Arg, RetainsToMove, ReleasesToMove, Retains, Releases, DeadInsts); + MoveCalls(Arg, RetainsToMove, ReleasesToMove, + Retains, Releases, DeadInsts, M); next_retain: NewReleases.clear(); @@ -2993,7 +3166,8 @@ bool ObjCARCOpt::OptimizeSequences(Function &F) { bool NestingDetected = Visit(F, BBStates, Retains, Releases); // Transform. 
- return PerformCodePlacement(BBStates, Retains, Releases) && NestingDetected; + return PerformCodePlacement(BBStates, Retains, Releases, F.getParent()) && + NestingDetected; } /// OptimizeReturns - Look for this pattern: @@ -3072,7 +3246,8 @@ void ObjCARCOpt::OptimizeReturns(Function &F) { // Check that there is nothing that can affect the reference // count between the retain and the call. - FindDependencies(CanChangeRetainCount, Arg, BB, Retain, + // Note that Retain need not be in BB. + FindDependencies(CanChangeRetainCount, Arg, Retain->getParent(), Retain, DependingInstructions, Visited, PA); if (DependingInstructions.size() != 1) goto next_block; @@ -3116,12 +3291,8 @@ bool ObjCARCOpt::doInitialization(Module &M) { // Identify the imprecise release metadata kind. ImpreciseReleaseMDKind = M.getContext().getMDKindID("clang.imprecise_release"); - - // Identify the declarations for objc_retain and friends. - RetainFunc = M.getFunction("objc_retain"); - RetainBlockFunc = M.getFunction("objc_retainBlock"); - RetainRVFunc = M.getFunction("objc_retainAutoreleasedReturnValue"); - ReleaseFunc = M.getFunction("objc_release"); + CopyOnEscapeMDKind = + M.getContext().getMDKindID("clang.arc.copy_on_escape"); // Intuitively, objc_retain and others are nocapture, however in practice // they are not, because they return their argument value. And objc_release @@ -3132,6 +3303,7 @@ bool ObjCARCOpt::doInitialization(Module &M) { AutoreleaseRVCallee = 0; ReleaseCallee = 0; RetainCallee = 0; + RetainBlockCallee = 0; AutoreleaseCallee = 0; return false; @@ -3377,7 +3549,7 @@ ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease, void ObjCARCContract::ContractRelease(Instruction *Release, inst_iterator &Iter) { LoadInst *Load = dyn_cast<LoadInst>(GetObjCArg(Release)); - if (!Load || Load->isVolatile()) return; + if (!Load || !Load->isSimple()) return; // For now, require everything to be in one basic block. BasicBlock *BB = Release->getParent(); @@ -3393,7 +3565,7 @@ void ObjCARCContract::ContractRelease(Instruction *Release, !(AA->getModRefInfo(I, Loc) & AliasAnalysis::Mod))) ++I; StoreInst *Store = dyn_cast<StoreInst>(I); - if (!Store || Store->isVolatile()) return; + if (!Store || !Store->isSimple()) return; if (Store->getPointerOperand() != Loc.Ptr) return; Value *New = StripPointerCastsAndObjCCalls(Store->getValueOperand()); diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index e6341ae..8f98a5b 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -309,7 +309,7 @@ void Reassociate::LinearizeExprTree(BinaryOperator *I, std::swap(LHS, RHS); bool Success = !I->swapOperands(); assert(Success && "swapOperands failed"); - Success = false; + (void)Success; MadeChange = true; } else if (RHSBO) { // Turn (A+B)+(C+D) -> (((A+B)+C)+D). This guarantees the RHS is not diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 5b12c92..196a847 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -156,7 +156,7 @@ namespace { /// class SCCPSolver : public InstVisitor<SCCPSolver> { const TargetData *TD; - SmallPtrSet<BasicBlock*, 8> BBExecutable;// The BBs that are executable. + SmallPtrSet<BasicBlock*, 8> BBExecutable; // The BBs that are executable. DenseMap<Value*, LatticeVal> ValueState; // The state each value is in. 
/// StructValueState - This maintains ValueState for values that have @@ -471,9 +471,9 @@ private: /// UsersOfOverdefinedPHIs map for PN, remove them now. void RemoveFromOverdefinedPHIs(Instruction *I, PHINode *PN) { if (UsersOfOverdefinedPHIs.empty()) return; - std::multimap<PHINode*, Instruction*>::iterator It, E; - tie(It, E) = UsersOfOverdefinedPHIs.equal_range(PN); - while (It != E) { + typedef std::multimap<PHINode*, Instruction*>::iterator ItTy; + std::pair<ItTy, ItTy> Range = UsersOfOverdefinedPHIs.equal_range(PN); + for (ItTy It = Range.first, E = Range.second; It != E;) { if (It->second == I) UsersOfOverdefinedPHIs.erase(It++); else @@ -486,9 +486,9 @@ private: /// (Duplicate entries do not break anything directly, but can lead to /// exponential growth of the table in rare cases.) void InsertInOverdefinedPHIs(Instruction *I, PHINode *PN) { - std::multimap<PHINode*, Instruction*>::iterator J, E; - tie(J, E) = UsersOfOverdefinedPHIs.equal_range(PN); - for (; J != E; ++J) + typedef std::multimap<PHINode*, Instruction*>::iterator ItTy; + std::pair<ItTy, ItTy> Range = UsersOfOverdefinedPHIs.equal_range(PN); + for (ItTy J = Range.first, E = Range.second; J != E; ++J) if (J->second == I) return; UsersOfOverdefinedPHIs.insert(std::make_pair(PN, I)); @@ -515,6 +515,7 @@ private: void visitShuffleVectorInst(ShuffleVectorInst &I); void visitExtractValueInst(ExtractValueInst &EVI); void visitInsertValueInst(InsertValueInst &IVI); + void visitLandingPadInst(LandingPadInst &I) { markAnythingOverdefined(&I); } // Instructions that cannot be folded away. void visitStoreInst (StoreInst &I); @@ -528,8 +529,12 @@ private: visitTerminatorInst(II); } void visitCallSite (CallSite CS); + void visitResumeInst (TerminatorInst &I) { /*returns void*/ } void visitUnwindInst (TerminatorInst &I) { /*returns void*/ } void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ } + void visitFenceInst (FenceInst &I) { /*returns void*/ } + void visitAtomicCmpXchgInst (AtomicCmpXchgInst &I) { markOverdefined(&I); } + void visitAtomicRMWInst (AtomicRMWInst &I) { markOverdefined(&I); } void visitAllocaInst (Instruction &I) { markOverdefined(&I); } void visitVAArgInst (Instruction &I) { markAnythingOverdefined(&I); } @@ -577,6 +582,10 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, } if (SwitchInst *SI = dyn_cast<SwitchInst>(&TI)) { + if (TI.getNumSuccessors() < 2) { + Succs[0] = true; + return; + } LatticeVal SCValue = getValueState(SI->getCondition()); ConstantInt *CI = SCValue.getConstantInt(); @@ -637,6 +646,9 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { return true; if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { + if (SI->getNumSuccessors() < 2) + return true; + LatticeVal SCValue = getValueState(SI->getCondition()); ConstantInt *CI = SCValue.getConstantInt(); @@ -692,13 +704,14 @@ void SCCPSolver::visitPHINode(PHINode &PN) { // There may be instructions using this PHI node that are not overdefined // themselves. If so, make sure that they know that the PHI node operand // changed. 
- std::multimap<PHINode*, Instruction*>::iterator I, E; - tie(I, E) = UsersOfOverdefinedPHIs.equal_range(&PN); - if (I == E) + typedef std::multimap<PHINode*, Instruction*>::iterator ItTy; + std::pair<ItTy, ItTy> Range = UsersOfOverdefinedPHIs.equal_range(&PN); + + if (Range.first == Range.second) return; SmallVector<Instruction*, 16> Users; - for (; I != E; ++I) + for (ItTy I = Range.first, E = Range.second; I != E; ++I) Users.push_back(I->second); while (!Users.empty()) visit(Users.pop_back_val()); @@ -1179,8 +1192,8 @@ void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) { } Constant *Ptr = Operands[0]; - markConstant(&I, ConstantExpr::getGetElementPtr(Ptr, &Operands[0]+1, - Operands.size()-1)); + ArrayRef<Constant *> Indices(Operands.begin() + 1, Operands.end()); + markConstant(&I, ConstantExpr::getGetElementPtr(Ptr, Indices)); } void SCCPSolver::visitStoreInst(StoreInst &SI) { @@ -1420,66 +1433,115 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { if (I->getType()->isVoidTy()) continue; if (StructType *STy = dyn_cast<StructType>(I->getType())) { - // Only a few things that can be structs matter for undef. Just send - // all their results to overdefined. We could be more precise than this - // but it isn't worth bothering. - if (isa<CallInst>(I) || isa<SelectInst>(I)) { - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { - LatticeVal &LV = getStructValueState(I, i); - if (LV.isUndefined()) - markOverdefined(LV, I); - } + // Only a few things that can be structs matter for undef. + + // Tracked calls must never be marked overdefined in ResolvedUndefsIn. + if (CallSite CS = CallSite(I)) + if (Function *F = CS.getCalledFunction()) + if (MRVFunctionsTracked.count(F)) + continue; + + // extractvalue and insertvalue don't need to be marked; they are + // tracked as precisely as their operands. + if (isa<ExtractValueInst>(I) || isa<InsertValueInst>(I)) + continue; + + // Send the results of everything else to overdefined. We could be + // more precise than this but it isn't worth bothering. + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + LatticeVal &LV = getStructValueState(I, i); + if (LV.isUndefined()) + markOverdefined(LV, I); } continue; } - + LatticeVal &LV = getValueState(I); if (!LV.isUndefined()) continue; - // No instructions using structs need disambiguation. - if (I->getOperand(0)->getType()->isStructTy()) + // extractvalue is safe; check here because the argument is a struct. + if (isa<ExtractValueInst>(I)) continue; - // Get the lattice values of the first two operands for use below. + // Compute the operand LatticeVals, for convenience below. + // Anything taking a struct is conservatively assumed to require + // overdefined markings. + if (I->getOperand(0)->getType()->isStructTy()) { + markOverdefined(I); + return true; + } LatticeVal Op0LV = getValueState(I->getOperand(0)); LatticeVal Op1LV; if (I->getNumOperands() == 2) { - // No instructions using structs need disambiguation. - if (I->getOperand(1)->getType()->isStructTy()) - continue; - - // If this is a two-operand instruction, and if both operands are - // undefs, the result stays undef. + if (I->getOperand(1)->getType()->isStructTy()) { + markOverdefined(I); + return true; + } + Op1LV = getValueState(I->getOperand(1)); - if (Op0LV.isUndefined() && Op1LV.isUndefined()) - continue; } - // If this is an instructions whose result is defined even if the input is // not fully defined, propagate the information. 
Type *ITy = I->getType(); switch (I->getOpcode()) { - default: break; // Leave the instruction as an undef. + case Instruction::Add: + case Instruction::Sub: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: + break; // Any undef -> undef + case Instruction::FSub: + case Instruction::FAdd: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + // Floating-point binary operation: be conservative. + if (Op0LV.isUndefined() && Op1LV.isUndefined()) + markForcedConstant(I, Constant::getNullValue(ITy)); + else + markOverdefined(I); + return true; case Instruction::ZExt: - // After a zero extend, we know the top part is zero. SExt doesn't have - // to be handled here, because we don't know whether the top part is 1's - // or 0's. - case Instruction::SIToFP: // some FP values are not possible, just use 0. - case Instruction::UIToFP: // some FP values are not possible, just use 0. + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + // undef -> 0; some outputs are impossible markForcedConstant(I, Constant::getNullValue(ITy)); return true; case Instruction::Mul: case Instruction::And: + // Both operands undef -> undef + if (Op0LV.isUndefined() && Op1LV.isUndefined()) + break; // undef * X -> 0. X could be zero. // undef & X -> 0. X could be zero. markForcedConstant(I, Constant::getNullValue(ITy)); return true; case Instruction::Or: + // Both operands undef -> undef + if (Op0LV.isUndefined() && Op1LV.isUndefined()) + break; // undef | X -> -1. X could be -1. markForcedConstant(I, Constant::getAllOnesValue(ITy)); return true; + case Instruction::Xor: + // undef ^ undef -> 0; strictly speaking, this is not strictly + // necessary, but we try to be nice to people who expect this + // behavior in simple cases + if (Op0LV.isUndefined() && Op1LV.isUndefined()) { + markForcedConstant(I, Constant::getNullValue(ITy)); + return true; + } + // undef ^ X -> undef + break; + case Instruction::SDiv: case Instruction::UDiv: case Instruction::SRem: @@ -1494,26 +1556,24 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { return true; case Instruction::AShr: - // undef >>s X -> undef. No change. - if (Op0LV.isUndefined()) break; - - // X >>s undef -> X. X could be 0, X could have the high-bit known set. - if (Op0LV.isConstant()) - markForcedConstant(I, Op0LV.getConstant()); - else - markOverdefined(I); + // X >>a undef -> undef. + if (Op1LV.isUndefined()) break; + + // undef >>a X -> all ones + markForcedConstant(I, Constant::getAllOnesValue(ITy)); return true; case Instruction::LShr: case Instruction::Shl: - // undef >> X -> undef. No change. - // undef << X -> undef. No change. - if (Op0LV.isUndefined()) break; - - // X >> undef -> 0. X could be 0. - // X << undef -> 0. X could be 0. + // X << undef -> undef. + // X >> undef -> undef. + if (Op1LV.isUndefined()) break; + + // undef << X -> 0 + // undef >> X -> 0 markForcedConstant(I, Constant::getNullValue(ITy)); return true; case Instruction::Select: + Op1LV = getValueState(I->getOperand(1)); // undef ? X : Y -> X or Y. There could be commonality between X/Y. if (Op0LV.isUndefined()) { if (!Op1LV.isConstant()) // Pick the constant one if there is any. 
@@ -1533,9 +1593,35 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { else markOverdefined(I); return true; + case Instruction::Load: + // A load here means one of two things: a load of undef from a global, + // a load from an unknown pointer. Either way, having it return undef + // is okay. + break; + case Instruction::ICmp: + // X == undef -> undef. Other comparisons get more complicated. + if (cast<ICmpInst>(I)->isEquality()) + break; + markOverdefined(I); + return true; case Instruction::Call: - // If a call has an undef result, it is because it is constant foldable - // but one of the inputs was undef. Just force the result to + case Instruction::Invoke: { + // There are two reasons a call can have an undef result + // 1. It could be tracked. + // 2. It could be constant-foldable. + // Because of the way we solve return values, tracked calls must + // never be marked overdefined in ResolvedUndefsIn. + if (Function *F = CallSite(I).getCalledFunction()) + if (TrackedRetVals.count(F)) + break; + + // If the call is constant-foldable, we mark it overdefined because + // we do not know what return values are valid. + markOverdefined(I); + return true; + } + default: + // If we don't know what should happen here, conservatively mark it // overdefined. markOverdefined(I); return true; @@ -1621,15 +1707,25 @@ FunctionPass *llvm::createSCCPPass() { static void DeleteInstructionInBlock(BasicBlock *BB) { DEBUG(dbgs() << " BasicBlock Dead:" << *BB); ++NumDeadBlocks; - - // Delete the instructions backwards, as it has a reduced likelihood of - // having to update as many def-use and use-def chains. - while (!isa<TerminatorInst>(BB->begin())) { - Instruction *I = --BasicBlock::iterator(BB->getTerminator()); - - if (!I->use_empty()) - I->replaceAllUsesWith(UndefValue::get(I->getType())); - BB->getInstList().erase(I); + + // Check to see if there are non-terminating instructions to delete. + if (isa<TerminatorInst>(BB->begin())) + return; + + // Delete the instructions backwards, as it has a reduced likelihood of having + // to update as many def-use and use-def chains. + Instruction *EndInst = BB->getTerminator(); // Last not to be deleted. + while (EndInst != BB->begin()) { + // Delete the next to last instruction. 
+ BasicBlock::iterator I = EndInst; + Instruction *Inst = --I; + if (!Inst->use_empty()) + Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); + if (isa<LandingPadInst>(Inst)) { + EndInst = Inst; + continue; + } + BB->getInstList().erase(Inst); ++NumInstRemoved; } } diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 302c287..f6918de 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -63,7 +63,6 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeCFGSimplifyPassPass(Registry); initializeSimplifyLibCallsPass(Registry); initializeSinkingPass(Registry); - initializeTailDupPass(Registry); initializeTailCallElimPass(Registry); } @@ -187,3 +186,7 @@ void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) { void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createBasicAliasAnalysisPass()); } + +void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLowerExpectIntrinsicPass()); +} diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index fbf3092..c6d9123 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -145,6 +145,9 @@ namespace { SmallVector<AllocaInst*, 32> &NewElts); void RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, SmallVector<AllocaInst*, 32> &NewElts); + void RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, + uint64_t Offset, + SmallVector<AllocaInst*, 32> &NewElts); void RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, AllocaInst *AI, SmallVector<AllocaInst*, 32> &NewElts); @@ -295,8 +298,6 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { if (ScalarKind == Unknown) ScalarKind = Integer; - // FIXME: It should be possible to promote the vector type up to the alloca's - // size. if (ScalarKind == Vector && VectorTy->getBitWidth() != AllocaSize * 8) ScalarKind = Integer; @@ -331,16 +332,12 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { /// (VectorTy) so far at the offset specified by Offset (which is specified in /// bytes). /// -/// There are three cases we handle here: +/// There are two cases we handle here: /// 1) A union of vector types of the same size and potentially its elements. /// Here we turn element accesses into insert/extract element operations. /// This promotes a <4 x float> with a store of float to the third element /// into a <4 x float> that uses insert element. -/// 2) A union of vector types with power-of-2 size differences, e.g. a float, -/// <2 x float> and <4 x float>. Here we turn element accesses into insert -/// and extract element operations, and <2 x float> accesses into a cast to -/// <2 x double>, an extract, and a cast back to <2 x float>. -/// 3) A fully general blob of memory, which we turn into some (potentially +/// 2) A fully general blob of memory, which we turn into some (potentially /// large) integer type with extract and insert operations where the loads /// and stores would mutate the memory. We mark this by setting VectorTy /// to VoidTy. @@ -371,20 +368,13 @@ void ConvertToScalarInfo::MergeInTypeForLoadOrStore(Type *In, // if the implied vector agrees with what we already have and if Offset is // compatible with it. 
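The tightened condition at the end of the hunk above is pure byte arithmetic: a scalar access only fits the implied vector view of the alloca when it lands on an element boundary, the alloca tiles evenly by that element size, and the access width matches the element width already chosen. A standalone sketch under those assumptions (accessFitsImpliedVector is an invented name):

#include <cstdint>

// Offsets and sizes are in bytes; CurEltSize == 0 means no vector type has
// been implied for the alloca yet.
bool accessFitsImpliedVector(uint64_t Offset, uint64_t AllocaSize,
                             uint64_t EltSize, uint64_t CurEltSize) {
  if (Offset % EltSize != 0 || AllocaSize % EltSize != 0)
    return false;               // Not on an element boundary / not tileable.
  return CurEltSize == 0 ||     // First access: it defines the element size.
         EltSize == CurEltSize; // Later accesses must use the same width.
}

// e.g. a float store at byte 8 of a 16-byte alloca already viewed as
// <4 x float>: accessFitsImpliedVector(8, 16, 4, 4) == true, while an i16
// access at byte 2 (EltSize 2 vs. CurEltSize 4) is rejected.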
if (Offset % EltSize == 0 && AllocaSize % EltSize == 0 && - (!VectorTy || Offset * 8 < VectorTy->getPrimitiveSizeInBits())) { + (!VectorTy || EltSize == VectorTy->getElementType() + ->getPrimitiveSizeInBits()/8)) { if (!VectorTy) { ScalarKind = ImplicitVector; VectorTy = VectorType::get(In, AllocaSize/EltSize); - return; } - - unsigned CurrentEltSize = VectorTy->getElementType() - ->getPrimitiveSizeInBits()/8; - if (EltSize == CurrentEltSize) - return; - - if (In->isIntegerTy() && isPowerOf2_32(AllocaSize / EltSize)) - return; + return; } } @@ -397,72 +387,19 @@ void ConvertToScalarInfo::MergeInTypeForLoadOrStore(Type *In, /// returning true if the type was successfully merged and false otherwise. bool ConvertToScalarInfo::MergeInVectorType(VectorType *VInTy, uint64_t Offset) { - // TODO: Support nonzero offsets? - if (Offset != 0) - return false; - - // Only allow vectors that are a power-of-2 away from the size of the alloca. - if (!isPowerOf2_64(AllocaSize / (VInTy->getBitWidth() / 8))) - return false; - - // If this the first vector we see, remember the type so that we know the - // element size. - if (!VectorTy) { + if (VInTy->getBitWidth()/8 == AllocaSize && Offset == 0) { + // If we're storing/loading a vector of the right size, allow it as a + // vector. If this the first vector we see, remember the type so that + // we know the element size. If this is a subsequent access, ignore it + // even if it is a differing type but the same size. Worst case we can + // bitcast the resultant vectors. + if (!VectorTy) + VectorTy = VInTy; ScalarKind = Vector; - VectorTy = VInTy; return true; } - unsigned BitWidth = VectorTy->getBitWidth(); - unsigned InBitWidth = VInTy->getBitWidth(); - - // Vectors of the same size can be converted using a simple bitcast. - if (InBitWidth == BitWidth && AllocaSize == (InBitWidth / 8)) { - ScalarKind = Vector; - return true; - } - - Type *ElementTy = VectorTy->getElementType(); - Type *InElementTy = VInTy->getElementType(); - - // Do not allow mixed integer and floating-point accesses from vectors of - // different sizes. - if (ElementTy->isFloatingPointTy() != InElementTy->isFloatingPointTy()) - return false; - - if (ElementTy->isFloatingPointTy()) { - // Only allow floating-point vectors of different sizes if they have the - // same element type. - // TODO: This could be loosened a bit, but would anything benefit? - if (ElementTy != InElementTy) - return false; - - // There are no arbitrary-precision floating-point types, which limits the - // number of legal vector types with larger element types that we can form - // to bitcast and extract a subvector. - // TODO: We could support some more cases with mixed fp128 and double here. - if (!(BitWidth == 64 || BitWidth == 128) || - !(InBitWidth == 64 || InBitWidth == 128)) - return false; - } else { - assert(ElementTy->isIntegerTy() && "Vector elements must be either integer " - "or floating-point."); - unsigned BitWidth = ElementTy->getPrimitiveSizeInBits(); - unsigned InBitWidth = InElementTy->getPrimitiveSizeInBits(); - - // Do not allow integer types smaller than a byte or types whose widths are - // not a multiple of a byte. - if (BitWidth < 8 || InBitWidth < 8 || - BitWidth % 8 != 0 || InBitWidth % 8 != 0) - return false; - } - - // Pick the largest of the two vector types. - ScalarKind = Vector; - if (InBitWidth > BitWidth) - VectorTy = VInTy; - - return true; + return false; } /// CanConvertToScalar - V is a pointer. 
If we can convert the pointee and all @@ -480,7 +417,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { if (LoadInst *LI = dyn_cast<LoadInst>(User)) { // Don't break volatile loads. - if (LI->isVolatile()) + if (!LI->isSimple()) return false; // Don't touch MMX operations. if (LI->getType()->isX86_MMXTy()) @@ -492,7 +429,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { if (StoreInst *SI = dyn_cast<StoreInst>(User)) { // Storing the pointer, not into the value? - if (SI->getOperand(0) == V || SI->isVolatile()) return false; + if (SI->getOperand(0) == V || !SI->isSimple()) return false; // Don't touch MMX operations. if (SI->getOperand(0)->getType()->isX86_MMXTy()) return false; @@ -502,7 +439,8 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { } if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) { - IsNotTrivial = true; // Can't be mem2reg'd. + if (!onlyUsedByLifetimeMarkers(BCI)) + IsNotTrivial = true; // Can't be mem2reg'd. if (!CanConvertToScalar(BCI, Offset)) return false; continue; @@ -560,6 +498,14 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { continue; } + // If this is a lifetime intrinsic, we can handle it. + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) { + continue; + } + } + // Otherwise, we cannot handle this! return false; } @@ -599,7 +545,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, if (LoadInst *LI = dyn_cast<LoadInst>(User)) { // The load is a bit extract from NewAI shifted right by Offset bits. - Value *LoadedVal = Builder.CreateLoad(NewAI, "tmp"); + Value *LoadedVal = Builder.CreateLoad(NewAI); Value *NewLoadVal = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, Builder); LI->replaceAllUsesWith(NewLoadVal); @@ -703,65 +649,18 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, continue; } - llvm_unreachable("Unsupported operation!"); - } -} + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) { + // There's no need to preserve these, as the resulting alloca will be + // converted to a register anyways. + II->eraseFromParent(); + continue; + } + } -/// getScaledElementType - Gets a scaled element type for a partial vector -/// access of an alloca. The input types must be integer or floating-point -/// scalar or vector types, and the resulting type is an integer, float or -/// double. -static Type *getScaledElementType(Type *Ty1, Type *Ty2, - unsigned NewBitWidth) { - bool IsFP1 = Ty1->isFloatingPointTy() || - (Ty1->isVectorTy() && - cast<VectorType>(Ty1)->getElementType()->isFloatingPointTy()); - bool IsFP2 = Ty2->isFloatingPointTy() || - (Ty2->isVectorTy() && - cast<VectorType>(Ty2)->getElementType()->isFloatingPointTy()); - - LLVMContext &Context = Ty1->getContext(); - - // Prefer floating-point types over integer types, as integer types may have - // been created by earlier scalar replacement. 
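Many hunks in this file replace isVolatile() guards with !isSimple(). A simple access is one that is neither volatile nor atomic, so the new guards also reject atomic loads and stores; a rough standalone model of the predicate (MemAccess is an invented stand-in for LoadInst/StoreInst):

struct MemAccess { bool Volatile; bool Atomic; };

static bool isSimple(const MemAccess &A) { return !A.Volatile && !A.Atomic; }

// The old guard "if (LI->isVolatile()) return false;" let atomic accesses
// through; the new "if (!LI->isSimple()) return false;" rejects both.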
- if (IsFP1 || IsFP2) { - if (NewBitWidth == 32) - return Type::getFloatTy(Context); - if (NewBitWidth == 64) - return Type::getDoubleTy(Context); + llvm_unreachable("Unsupported operation!"); } - - return Type::getIntNTy(Context, NewBitWidth); -} - -/// CreateShuffleVectorCast - Creates a shuffle vector to convert one vector -/// to another vector of the same element type which has the same allocation -/// size but different primitive sizes (e.g. <3 x i32> and <4 x i32>). -static Value *CreateShuffleVectorCast(Value *FromVal, Type *ToType, - IRBuilder<> &Builder) { - Type *FromType = FromVal->getType(); - VectorType *FromVTy = cast<VectorType>(FromType); - VectorType *ToVTy = cast<VectorType>(ToType); - assert((ToVTy->getElementType() == FromVTy->getElementType()) && - "Vectors must have the same element type"); - Value *UnV = UndefValue::get(FromType); - unsigned numEltsFrom = FromVTy->getNumElements(); - unsigned numEltsTo = ToVTy->getNumElements(); - - SmallVector<Constant*, 3> Args; - Type* Int32Ty = Builder.getInt32Ty(); - unsigned minNumElts = std::min(numEltsFrom, numEltsTo); - unsigned i; - for (i=0; i != minNumElts; ++i) - Args.push_back(ConstantInt::get(Int32Ty, i)); - - if (i < numEltsTo) { - Constant* UnC = UndefValue::get(Int32Ty); - for (; i != numEltsTo; ++i) - Args.push_back(UnC); - } - Constant *Mask = ConstantVector::get(Args); - return Builder.CreateShuffleVector(FromVal, UnV, Mask, "tmpV"); } /// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer @@ -787,38 +686,8 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, if (VectorType *VTy = dyn_cast<VectorType>(FromType)) { unsigned FromTypeSize = TD.getTypeAllocSize(FromType); unsigned ToTypeSize = TD.getTypeAllocSize(ToType); - if (FromTypeSize == ToTypeSize) { - // If the two types have the same primitive size, use a bit cast. - // Otherwise, it is two vectors with the same element type that has - // the same allocation size but different number of elements so use - // a shuffle vector. - if (FromType->getPrimitiveSizeInBits() == - ToType->getPrimitiveSizeInBits()) - return Builder.CreateBitCast(FromVal, ToType, "tmp"); - else - return CreateShuffleVectorCast(FromVal, ToType, Builder); - } - - if (isPowerOf2_64(FromTypeSize / ToTypeSize)) { - assert(!(ToType->isVectorTy() && Offset != 0) && "Can't extract a value " - "of a smaller vector type at a nonzero offset."); - - Type *CastElementTy = getScaledElementType(FromType, ToType, - ToTypeSize * 8); - unsigned NumCastVectorElements = FromTypeSize / ToTypeSize; - - LLVMContext &Context = FromVal->getContext(); - Type *CastTy = VectorType::get(CastElementTy, - NumCastVectorElements); - Value *Cast = Builder.CreateBitCast(FromVal, CastTy, "tmp"); - - unsigned EltSize = TD.getTypeAllocSizeInBits(CastElementTy); - unsigned Elt = Offset/EltSize; - assert(EltSize*Elt == Offset && "Invalid modulus in validity checking"); - Value *Extract = Builder.CreateExtractElement(Cast, ConstantInt::get( - Type::getInt32Ty(Context), Elt), "tmp"); - return Builder.CreateBitCast(Extract, ToType, "tmp"); - } + if (FromTypeSize == ToTypeSize) + return Builder.CreateBitCast(FromVal, ToType); // Otherwise it must be an element access. unsigned Elt = 0; @@ -828,10 +697,9 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, assert(EltSize*Elt == Offset && "Invalid modulus in validity checking"); } // Return the element extracted out of it. 
- Value *V = Builder.CreateExtractElement(FromVal, ConstantInt::get( - Type::getInt32Ty(FromVal->getContext()), Elt), "tmp"); + Value *V = Builder.CreateExtractElement(FromVal, Builder.getInt32(Elt)); if (V->getType() != ToType) - V = Builder.CreateBitCast(V, ToType, "tmp"); + V = Builder.CreateBitCast(V, ToType); return V; } @@ -844,7 +712,7 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i), Offset+Layout.getElementOffsetInBits(i), Builder); - Res = Builder.CreateInsertValue(Res, Elt, i, "tmp"); + Res = Builder.CreateInsertValue(Res, Elt, i); } return Res; } @@ -855,7 +723,7 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(), Offset+i*EltSize, Builder); - Res = Builder.CreateInsertValue(Res, Elt, i, "tmp"); + Res = Builder.CreateInsertValue(Res, Elt, i); } return Res; } @@ -881,33 +749,31 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, // only some bits are used. if (ShAmt > 0 && (unsigned)ShAmt < NTy->getBitWidth()) FromVal = Builder.CreateLShr(FromVal, - ConstantInt::get(FromVal->getType(), - ShAmt), "tmp"); + ConstantInt::get(FromVal->getType(), ShAmt)); else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth()) FromVal = Builder.CreateShl(FromVal, - ConstantInt::get(FromVal->getType(), - -ShAmt), "tmp"); + ConstantInt::get(FromVal->getType(), -ShAmt)); // Finally, unconditionally truncate the integer to the right width. unsigned LIBitWidth = TD.getTypeSizeInBits(ToType); if (LIBitWidth < NTy->getBitWidth()) FromVal = Builder.CreateTrunc(FromVal, IntegerType::get(FromVal->getContext(), - LIBitWidth), "tmp"); + LIBitWidth)); else if (LIBitWidth > NTy->getBitWidth()) FromVal = Builder.CreateZExt(FromVal, IntegerType::get(FromVal->getContext(), - LIBitWidth), "tmp"); + LIBitWidth)); // If the result is an integer, this is a trunc or bitcast. if (ToType->isIntegerTy()) { // Should be done. } else if (ToType->isFloatingPointTy() || ToType->isVectorTy()) { // Just do a bitcast, we know the sizes match up. - FromVal = Builder.CreateBitCast(FromVal, ToType, "tmp"); + FromVal = Builder.CreateBitCast(FromVal, ToType); } else { // Otherwise must be a pointer. - FromVal = Builder.CreateIntToPtr(FromVal, ToType, "tmp"); + FromVal = Builder.CreateIntToPtr(FromVal, ToType); } assert(FromVal->getType() == ToType && "Didn't convert right?"); return FromVal; @@ -936,56 +802,21 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, // Changing the whole vector with memset or with an access of a different // vector type? - if (ValSize == VecSize) { - // If the two types have the same primitive size, use a bit cast. - // Otherwise, it is two vectors with the same element type that has - // the same allocation size but different number of elements so use - // a shuffle vector. 
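The extract/insert paths that remain reduce to size arithmetic: a matching allocation size means one bitcast, otherwise the bit offset selects a single element. A standalone sketch (classifyVectorAccess is an invented name; sizes are in bits to match the getTypeAllocSizeInBits calls):

#include <cassert>
#include <cstdint>

enum class VecAccessKind { WholeVectorBitcast, SingleElement };

// Hypothetical model: how an access of AccessBits at BitOffset maps onto a
// vector-typed alloca of VecBits whose elements are EltBits wide.
VecAccessKind classifyVectorAccess(uint64_t VecBits, uint64_t EltBits,
                                   uint64_t AccessBits, uint64_t BitOffset,
                                   unsigned &EltIndex) {
  if (AccessBits == VecBits) {
    EltIndex = 0;                      // Same size: a single bitcast suffices.
    return VecAccessKind::WholeVectorBitcast;
  }
  assert(BitOffset % EltBits == 0 && "access must land on an element boundary");
  EltIndex = static_cast<unsigned>(BitOffset / EltBits);
  return VecAccessKind::SingleElement;
}

// e.g. a float load at bit offset 64 from a <4 x float> alloca is element 2.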
- if (VTy->getPrimitiveSizeInBits() == - SV->getType()->getPrimitiveSizeInBits()) - return Builder.CreateBitCast(SV, AllocaType, "tmp"); - else - return CreateShuffleVectorCast(SV, VTy, Builder); - } - - if (isPowerOf2_64(VecSize / ValSize)) { - assert(!(SV->getType()->isVectorTy() && Offset != 0) && "Can't insert a " - "value of a smaller vector type at a nonzero offset."); - - Type *CastElementTy = getScaledElementType(VTy, SV->getType(), - ValSize); - unsigned NumCastVectorElements = VecSize / ValSize; - - LLVMContext &Context = SV->getContext(); - Type *OldCastTy = VectorType::get(CastElementTy, - NumCastVectorElements); - Value *OldCast = Builder.CreateBitCast(Old, OldCastTy, "tmp"); - - Value *SVCast = Builder.CreateBitCast(SV, CastElementTy, "tmp"); - - unsigned EltSize = TD.getTypeAllocSizeInBits(CastElementTy); - unsigned Elt = Offset/EltSize; - assert(EltSize*Elt == Offset && "Invalid modulus in validity checking"); - Value *Insert = - Builder.CreateInsertElement(OldCast, SVCast, ConstantInt::get( - Type::getInt32Ty(Context), Elt), "tmp"); - return Builder.CreateBitCast(Insert, AllocaType, "tmp"); - } + if (ValSize == VecSize) + return Builder.CreateBitCast(SV, AllocaType); // Must be an element insertion. assert(SV->getType() == VTy->getElementType()); uint64_t EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType()); unsigned Elt = Offset/EltSize; - return Builder.CreateInsertElement(Old, SV, - ConstantInt::get(Type::getInt32Ty(SV->getContext()), Elt), - "tmp"); + return Builder.CreateInsertElement(Old, SV, Builder.getInt32(Elt)); } // If SV is a first-class aggregate value, insert each value recursively. if (StructType *ST = dyn_cast<StructType>(SV->getType())) { const StructLayout &Layout = *TD.getStructLayout(ST); for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { - Value *Elt = Builder.CreateExtractValue(SV, i, "tmp"); + Value *Elt = Builder.CreateExtractValue(SV, i); Old = ConvertScalar_InsertValue(Elt, Old, Offset+Layout.getElementOffsetInBits(i), Builder); @@ -996,7 +827,7 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, if (ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) { uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType()); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { - Value *Elt = Builder.CreateExtractValue(SV, i, "tmp"); + Value *Elt = Builder.CreateExtractValue(SV, i); Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, Builder); } return Old; @@ -1009,20 +840,19 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, unsigned SrcStoreWidth = TD.getTypeStoreSizeInBits(SV->getType()); unsigned DestStoreWidth = TD.getTypeStoreSizeInBits(AllocaType); if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy()) - SV = Builder.CreateBitCast(SV, - IntegerType::get(SV->getContext(),SrcWidth), "tmp"); + SV = Builder.CreateBitCast(SV, IntegerType::get(SV->getContext(),SrcWidth)); else if (SV->getType()->isPointerTy()) - SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext()), "tmp"); + SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext())); // Zero extend or truncate the value if needed. if (SV->getType() != AllocaType) { if (SV->getType()->getPrimitiveSizeInBits() < AllocaType->getPrimitiveSizeInBits()) - SV = Builder.CreateZExt(SV, AllocaType, "tmp"); + SV = Builder.CreateZExt(SV, AllocaType); else { // Truncation may be needed if storing more than the alloca can hold // (undefined behavior). 
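The integer path this hunk continues with is a plain bitfield insert: widen the value, shift it to its offset, and merge it into the old contents under a mask. A standalone model of that arithmetic, assuming the usual mask-and-or merge that follows the shifts shown here (insertBits is an invented name):

#include <cstdint>

// 64-bit model of inserting SrcWidth bits of Val into Old at bit position
// Shift (the pass does the same with APInt and IRBuilder).
uint64_t insertBits(uint64_t Old, uint64_t Val, unsigned SrcWidth,
                    unsigned Shift) {
  uint64_t Mask = (SrcWidth >= 64 ? ~0ULL : ((1ULL << SrcWidth) - 1)) << Shift;
  return (Old & ~Mask) | ((Val << Shift) & Mask);
}

// insertBits(0xFFFFFFFF'FFFFFFFF, 0xAB, 8, 16) == 0xFFFFFFFF'FFABFFFF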
- SV = Builder.CreateTrunc(SV, AllocaType, "tmp"); + SV = Builder.CreateTrunc(SV, AllocaType); SrcWidth = DestWidth; SrcStoreWidth = DestStoreWidth; } @@ -1045,12 +875,10 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, // only some bits in the structure are set. APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth)); if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) { - SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(), - ShAmt), "tmp"); + SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(), ShAmt)); Mask <<= ShAmt; } else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) { - SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(), - -ShAmt), "tmp"); + SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(), -ShAmt)); Mask = Mask.lshr(-ShAmt); } @@ -1196,7 +1024,7 @@ static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) { for (Value::use_iterator UI = SI->use_begin(), UE = SI->use_end(); UI != UE; ++UI) { LoadInst *LI = dyn_cast<LoadInst>(*UI); - if (LI == 0 || LI->isVolatile()) return false; + if (LI == 0 || !LI->isSimple()) return false; // Both operands to the select need to be dereferencable, either absolutely // (e.g. allocas) or at this point because we can see other accesses to it. @@ -1237,7 +1065,7 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) { for (Value::use_iterator UI = PN->use_begin(), UE = PN->use_end(); UI != UE; ++UI) { LoadInst *LI = dyn_cast<LoadInst>(*UI); - if (LI == 0 || LI->isVolatile()) return false; + if (LI == 0 || !LI->isSimple()) return false; // For now we only allow loads in the same block as the PHI. This is a // common case that happens when instcombine merges two loads through a PHI. @@ -1258,17 +1086,21 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) { // trapping load in the predecessor if it is a critical edge. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { BasicBlock *Pred = PN->getIncomingBlock(i); + Value *InVal = PN->getIncomingValue(i); + + // If the terminator of the predecessor has side-effects (an invoke), + // there is no safe place to put a load in the predecessor. + if (Pred->getTerminator()->mayHaveSideEffects()) + return false; + + // If the value is produced by the terminator of the predecessor + // (an invoke), there is no valid place to put a load in the predecessor. + if (Pred->getTerminator() == InVal) + return false; // If the predecessor has a single successor, then the edge isn't critical. if (Pred->getTerminator()->getNumSuccessors() == 1) continue; - - Value *InVal = PN->getIncomingValue(i); - - // If the InVal is an invoke in the pred, we can't put a load on the edge. - if (InvokeInst *II = dyn_cast<InvokeInst>(InVal)) - if (II->getParent() == Pred) - return false; // If this pointer is always safe to load, or if we can prove that there is // already a load in the block, then we can move the load to the pred block. @@ -1295,13 +1127,13 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { UI != UE; ++UI) { User *U = *UI; if (LoadInst *LI = dyn_cast<LoadInst>(U)) { - if (LI->isVolatile()) + if (!LI->isSimple()) return false; continue; } if (StoreInst *SI = dyn_cast<StoreInst>(U)) { - if (SI->getOperand(0) == AI || SI->isVolatile()) + if (SI->getOperand(0) == AI || !SI->isSimple()) return false; // Don't allow a store OF the AI, only INTO the AI. 
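The "store OF the AI, only INTO the AI" rule above, shown at the source level (illustrative C++ only):

int *Global;

void intoOnly() {
  int X;       // promotable: the address of X never escapes
  X = 42;      // store INTO the alloca
}

void escapes() {
  int X;
  Global = &X; // store OF the alloca's address -- promotion must bail
}                // (intentionally leaves a dangling pointer; illustration only)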
continue; } @@ -1343,6 +1175,13 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { continue; } + if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) { + if (onlyUsedByLifetimeMarkers(BCI)) { + InstsToRewrite.insert(BCI); + continue; + } + } + return false; } @@ -1354,6 +1193,18 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { // If we have instructions that need to be rewritten for this to be promotable // take care of it now. for (unsigned i = 0, e = InstsToRewrite.size(); i != e; ++i) { + if (BitCastInst *BCI = dyn_cast<BitCastInst>(InstsToRewrite[i])) { + // This could only be a bitcast used by nothing but lifetime intrinsics. + for (BitCastInst::use_iterator I = BCI->use_begin(), E = BCI->use_end(); + I != E;) { + Use &U = I.getUse(); + ++I; + cast<Instruction>(U.getUser())->eraseFromParent(); + } + BCI->eraseFromParent(); + continue; + } + if (SelectInst *SI = dyn_cast<SelectInst>(InstsToRewrite[i])) { // Selects in InstsToRewrite only have load uses. Rewrite each as two // loads with a new select. @@ -1670,7 +1521,7 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, UI.getOperandNo() == 0, Info, MI, true /*AllowWholeAccess*/); } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) { - if (LI->isVolatile()) + if (!LI->isSimple()) return MarkUnsafe(Info, User); Type *LIType = LI->getType(); isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType), @@ -1679,13 +1530,17 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) { // Store is ok if storing INTO the pointer, not storing the pointer - if (SI->isVolatile() || SI->getOperand(0) == I) + if (!SI->isSimple() || SI->getOperand(0) == I) return MarkUnsafe(Info, User); Type *SIType = SI->getOperand(0)->getType(); isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType), SIType, true, Info, SI, true /*AllowWholeAccess*/); Info.hasALoadOrStore = true; + } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { + if (II->getIntrinsicID() != Intrinsic::lifetime_start && + II->getIntrinsicID() != Intrinsic::lifetime_end) + return MarkUnsafe(Info, User); } else if (isa<PHINode>(User) || isa<SelectInst>(User)) { isSafePHISelectUseForScalarRepl(User, Offset, Info); } else { @@ -1725,7 +1580,7 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, return MarkUnsafe(Info, User); isSafePHISelectUseForScalarRepl(GEPI, Offset, Info); } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) { - if (LI->isVolatile()) + if (!LI->isSimple()) return MarkUnsafe(Info, User); Type *LIType = LI->getType(); isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType), @@ -1734,7 +1589,7 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) { // Store is ok if storing INTO the pointer, not storing the pointer - if (SI->isVolatile() || SI->getOperand(0) == I) + if (!SI->isSimple() || SI->getOperand(0) == I) return MarkUnsafe(Info, User); Type *SIType = SI->getOperand(0)->getType(); @@ -1923,6 +1778,14 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, // address operand will be updated, so nothing else needs to be done. 
continue; } + + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) { + RewriteLifetimeIntrinsic(II, AI, Offset, NewElts); + } + continue; + } if (LoadInst *LI = dyn_cast<LoadInst>(User)) { Type *LIType = LI->getType(); @@ -2080,8 +1943,7 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, } Instruction *Val = NewElts[Idx]; if (NewArgs.size() > 1) { - Val = GetElementPtrInst::CreateInBounds(Val, NewArgs.begin(), - NewArgs.end(), "", GEPI); + Val = GetElementPtrInst::CreateInBounds(Val, NewArgs, "", GEPI); Val->takeName(GEPI); } if (Val->getType() != GEPI->getType()) @@ -2090,6 +1952,62 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, DeadInsts.push_back(GEPI); } +/// RewriteLifetimeIntrinsic - II is a lifetime.start/lifetime.end. Rewrite it +/// to mark the lifetime of the scalarized memory. +void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, + uint64_t Offset, + SmallVector<AllocaInst*, 32> &NewElts) { + ConstantInt *OldSize = cast<ConstantInt>(II->getArgOperand(0)); + // Put matching lifetime markers on everything from Offset up to + // Offset+OldSize. + Type *AIType = AI->getAllocatedType(); + uint64_t NewOffset = Offset; + Type *IdxTy; + uint64_t Idx = FindElementAndOffset(AIType, NewOffset, IdxTy); + + IRBuilder<> Builder(II); + uint64_t Size = OldSize->getLimitedValue(); + + if (NewOffset) { + // Splice the first element and index 'NewOffset' bytes in. SROA will + // split the alloca again later. + Value *V = Builder.CreateBitCast(NewElts[Idx], Builder.getInt8PtrTy()); + V = Builder.CreateGEP(V, Builder.getInt64(NewOffset)); + + IdxTy = NewElts[Idx]->getAllocatedType(); + uint64_t EltSize = TD->getTypeAllocSize(IdxTy) - NewOffset; + if (EltSize > Size) { + EltSize = Size; + Size = 0; + } else { + Size -= EltSize; + } + if (II->getIntrinsicID() == Intrinsic::lifetime_start) + Builder.CreateLifetimeStart(V, Builder.getInt64(EltSize)); + else + Builder.CreateLifetimeEnd(V, Builder.getInt64(EltSize)); + ++Idx; + } + + for (; Idx != NewElts.size() && Size; ++Idx) { + IdxTy = NewElts[Idx]->getAllocatedType(); + uint64_t EltSize = TD->getTypeAllocSize(IdxTy); + if (EltSize > Size) { + EltSize = Size; + Size = 0; + } else { + Size -= EltSize; + } + if (II->getIntrinsicID() == Intrinsic::lifetime_start) + Builder.CreateLifetimeStart(NewElts[Idx], + Builder.getInt64(EltSize)); + else + Builder.CreateLifetimeEnd(NewElts[Idx], + Builder.getInt64(EltSize)); + } + DeadInsts.push_back(II); +} + /// RewriteMemIntrinUserOfAlloca - MI is a memcpy/memset/memmove from or to AI. /// Rewrite it to copy or set the elements of the scalarized memory. void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, @@ -2157,7 +2075,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, if (OtherPtr) { Value *Idx[2] = { Zero, ConstantInt::get(Type::getInt32Ty(MI->getContext()), i) }; - OtherElt = GetElementPtrInst::CreateInBounds(OtherPtr, Idx, Idx + 2, + OtherElt = GetElementPtrInst::CreateInBounds(OtherPtr, Idx, OtherPtr->getName()+"."+Twine(i), MI); uint64_t EltOffset; @@ -2226,8 +2144,8 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, assert(StoreVal->getType() == ValTy && "Type mismatch!"); // If the requested value was a vector constant, create it. 
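RewriteLifetimeIntrinsic above walks the element allocas and gives each one a marker covering however many of the remaining bytes it holds. The size bookkeeping is sketched standalone below (splitLifetime is an invented name; sizes are in bytes, as with getTypeAllocSize):

#include <cstdint>
#include <utility>
#include <vector>

// Given the byte sizes of the scalarized elements and a lifetime marker of
// Size bytes starting at element FirstElt (after any leading partial element
// has been peeled off), return how many bytes each per-element marker covers.
std::vector<std::pair<unsigned, uint64_t>>
splitLifetime(const std::vector<uint64_t> &EltSizes, unsigned FirstElt,
              uint64_t Size) {
  std::vector<std::pair<unsigned, uint64_t>> Markers; // (element, byte count)
  for (unsigned Idx = FirstElt; Idx != EltSizes.size() && Size; ++Idx) {
    uint64_t EltSize = EltSizes[Idx];
    if (EltSize > Size) { EltSize = Size; Size = 0; } // trailing partial element
    else                { Size -= EltSize; }
    Markers.emplace_back(Idx, EltSize);
  }
  return Markers;
}

// e.g. elements {4, 8, 4} and a 10-byte marker starting at element 0 yield
// markers of 4 and 6 bytes, and no marker at all for the last element.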
- if (EltTy != ValTy) { - unsigned NumElts = cast<VectorType>(ValTy)->getNumElements(); + if (EltTy->isVectorTy()) { + unsigned NumElts = cast<VectorType>(EltTy)->getNumElements(); SmallVector<Constant*, 16> Elts(NumElts, StoreVal); StoreVal = ConstantVector::get(Elts); } @@ -2574,7 +2492,7 @@ isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, if (LoadInst *LI = dyn_cast<LoadInst>(U)) { // Ignore non-volatile loads, they are always ok. - if (LI->isVolatile()) return false; + if (!LI->isSimple()) return false; continue; } diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index ad52417..fbb9465 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -338,16 +338,17 @@ struct StrCmpOpt : public LibCallOptimization { bool HasStr1 = GetConstantStringInfo(Str1P, Str1); bool HasStr2 = GetConstantStringInfo(Str2P, Str2); - if (HasStr1 && Str1.empty()) // strcmp("", x) -> *x - return B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), CI->getType()); - - if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x - return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); - // strcmp(x, y) -> cnst (if both x and y are constant strings) if (HasStr1 && HasStr2) return ConstantInt::get(CI->getType(), - strcmp(Str1.c_str(),Str2.c_str())); + StringRef(Str1).compare(Str2)); + + if (HasStr1 && Str1.empty()) // strcmp("", x) -> -*x + return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), + CI->getType())); + + if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x + return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); // strcmp(P, "x") -> memcmp(P, "x", 2) uint64_t Len1 = GetStringLength(Str1P); @@ -400,16 +401,20 @@ struct StrNCmpOpt : public LibCallOptimization { bool HasStr1 = GetConstantStringInfo(Str1P, Str1); bool HasStr2 = GetConstantStringInfo(Str2P, Str2); - if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> *x - return B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), CI->getType()); + // strncmp(x, y) -> cnst (if both x and y are constant strings) + if (HasStr1 && HasStr2) { + StringRef SubStr1 = StringRef(Str1).substr(0, Length); + StringRef SubStr2 = StringRef(Str2).substr(0, Length); + return ConstantInt::get(CI->getType(), SubStr1.compare(SubStr2)); + } + + if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> -*x + return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), + CI->getType())); if (HasStr2 && Str2.empty()) // strncmp(x, "", n) -> *x return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); - // strncmp(x, y) -> cnst (if both x and y are constant strings) - if (HasStr1 && HasStr2) - return ConstantInt::get(CI->getType(), - strncmp(Str1.c_str(), Str2.c_str(), Length)); return 0; } }; @@ -874,8 +879,8 @@ struct PowOpt : public LibCallOptimization { Callee->getAttributes()); Value *FAbs = EmitUnaryFloatFnCall(Sqrt, "fabs", B, Callee->getAttributes()); - Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf, "tmp"); - Value *Sel = B.CreateSelect(FCmp, Inf, FAbs, "tmp"); + Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf); + Value *Sel = B.CreateSelect(FCmp, Inf, FAbs); return Sel; } @@ -908,10 +913,10 @@ struct Exp2Opt : public LibCallOptimization { Value *LdExpArg = 0; if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) { if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32) - LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty(), "tmp"); + LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty()); } else if (UIToFPInst 
*OpC = dyn_cast<UIToFPInst>(Op)) { if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32) - LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty(), "tmp"); + LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty()); } if (LdExpArg) { @@ -996,10 +1001,10 @@ struct FFSOpt : public LibCallOptimization { Value *F = Intrinsic::getDeclaration(Callee->getParent(), Intrinsic::cttz, ArgType); Value *V = B.CreateCall(F, Op, "cttz"); - V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1), "tmp"); - V = B.CreateIntCast(V, B.getInt32Ty(), false, "tmp"); + V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1)); + V = B.CreateIntCast(V, B.getInt32Ty(), false); - Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType), "tmp"); + Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType)); return B.CreateSelect(Cond, V, B.getInt32(0)); } }; diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp index 705f442..c83f56c 100644 --- a/lib/Transforms/Scalar/Sink.cpp +++ b/lib/Transforms/Scalar/Sink.cpp @@ -153,9 +153,13 @@ bool Sinking::ProcessBlock(BasicBlock &BB) { static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA, SmallPtrSet<Instruction *, 8> &Stores) { - if (LoadInst *L = dyn_cast<LoadInst>(Inst)) { - if (L->isVolatile()) return false; + if (Inst->mayWriteToMemory()) { + Stores.insert(Inst); + return false; + } + + if (LoadInst *L = dyn_cast<LoadInst>(Inst)) { AliasAnalysis::Location Loc = AA->getLocation(L); for (SmallPtrSet<Instruction *, 8>::iterator I = Stores.begin(), E = Stores.end(); I != E; ++I) @@ -163,11 +167,6 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA, return false; } - if (Inst->mayWriteToMemory()) { - Stores.insert(Inst); - return false; - } - if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst)) return false; diff --git a/lib/Transforms/Scalar/TailDuplication.cpp b/lib/Transforms/Scalar/TailDuplication.cpp deleted file mode 100644 index 9dd83c0..0000000 --- a/lib/Transforms/Scalar/TailDuplication.cpp +++ /dev/null @@ -1,373 +0,0 @@ -//===- TailDuplication.cpp - Simplify CFG through tail duplication --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass performs a limited form of tail duplication, intended to simplify -// CFGs by removing some unconditional branches. This pass is necessary to -// straighten out loops created by the C front-end, but also is capable of -// making other code nicer. After this pass is run, the CFG simplify pass -// should be run to clean up the mess. -// -// This pass could be enhanced in the future to use profile information to be -// more aggressive. 
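For context on the pass being deleted here, the transformation its header describes looks roughly like this at the source level (hand-written C++ illustration, not output of the pass):

// Before: two paths reach the small shared tail T via unconditional jumps.
int before(int a, int b, bool p) {
  int c;
  if (p) { c = a + 1; goto T; }
  c = b + 1;
T:
  return c * 2;  // the shared tail
}

// After tail duplication: the tail is cloned into each predecessor, so the
// unconditional branches disappear at the cost of duplicating T's code.
int after(int a, int b, bool p) {
  if (p) { int c = a + 1; return c * 2; }
  int c = b + 1;
  return c * 2;
}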
-// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "tailduplicate" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Constant.h" -#include "llvm/Function.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Pass.h" -#include "llvm/Type.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Support/CFG.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/Local.h" -#include <map> -using namespace llvm; - -STATISTIC(NumEliminated, "Number of unconditional branches eliminated"); - -static cl::opt<unsigned> -TailDupThreshold("taildup-threshold", - cl::desc("Max block size to tail duplicate"), - cl::init(1), cl::Hidden); - -namespace { - class TailDup : public FunctionPass { - bool runOnFunction(Function &F); - public: - static char ID; // Pass identification, replacement for typeid - TailDup() : FunctionPass(ID) { - initializeTailDupPass(*PassRegistry::getPassRegistry()); - } - - private: - inline bool shouldEliminateUnconditionalBranch(TerminatorInst *, unsigned); - inline void eliminateUnconditionalBranch(BranchInst *BI); - SmallPtrSet<BasicBlock*, 4> CycleDetector; - }; -} - -char TailDup::ID = 0; -INITIALIZE_PASS(TailDup, "tailduplicate", "Tail Duplication", false, false) - -// Public interface to the Tail Duplication pass -FunctionPass *llvm::createTailDuplicationPass() { return new TailDup(); } - -/// runOnFunction - Top level algorithm - Loop over each unconditional branch in -/// the function, eliminating it if it looks attractive enough. CycleDetector -/// prevents infinite loops by checking that we aren't redirecting a branch to -/// a place it already pointed to earlier; see PR 2323. -bool TailDup::runOnFunction(Function &F) { - bool Changed = false; - CycleDetector.clear(); - for (Function::iterator I = F.begin(), E = F.end(); I != E; ) { - if (shouldEliminateUnconditionalBranch(I->getTerminator(), - TailDupThreshold)) { - eliminateUnconditionalBranch(cast<BranchInst>(I->getTerminator())); - Changed = true; - } else { - ++I; - CycleDetector.clear(); - } - } - return Changed; -} - -/// shouldEliminateUnconditionalBranch - Return true if this branch looks -/// attractive to eliminate. We eliminate the branch if the destination basic -/// block has <= 5 instructions in it, not counting PHI nodes. In practice, -/// since one of these is a terminator instruction, this means that we will add -/// up to 4 instructions to the new block. -/// -/// We don't count PHI nodes in the count since they will be removed when the -/// contents of the block are copied over. -/// -bool TailDup::shouldEliminateUnconditionalBranch(TerminatorInst *TI, - unsigned Threshold) { - BranchInst *BI = dyn_cast<BranchInst>(TI); - if (!BI || !BI->isUnconditional()) return false; // Not an uncond branch! - - BasicBlock *Dest = BI->getSuccessor(0); - if (Dest == BI->getParent()) return false; // Do not loop infinitely! - - // Do not inline a block if we will just get another branch to the same block! - TerminatorInst *DTI = Dest->getTerminator(); - if (BranchInst *DBI = dyn_cast<BranchInst>(DTI)) - if (DBI->isUnconditional() && DBI->getSuccessor(0) == Dest) - return false; // Do not loop infinitely! - - // FIXME: DemoteRegToStack cannot yet demote invoke instructions to the stack, - // because doing so would require breaking critical edges. 
This should be - // fixed eventually. - if (!DTI->use_empty()) - return false; - - // Do not bother with blocks with only a single predecessor: simplify - // CFG will fold these two blocks together! - pred_iterator PI = pred_begin(Dest), PE = pred_end(Dest); - ++PI; - if (PI == PE) return false; // Exactly one predecessor! - - BasicBlock::iterator I = Dest->getFirstNonPHI(); - - for (unsigned Size = 0; I != Dest->end(); ++I) { - if (Size == Threshold) return false; // The block is too large. - - // Don't tail duplicate call instructions. They are very large compared to - // other instructions. - if (isa<CallInst>(I) || isa<InvokeInst>(I)) return false; - - // Also alloca and malloc. - if (isa<AllocaInst>(I)) return false; - - // Some vector instructions can expand into a number of instructions. - if (isa<ShuffleVectorInst>(I) || isa<ExtractElementInst>(I) || - isa<InsertElementInst>(I)) return false; - - // Only count instructions that are not debugger intrinsics. - if (!isa<DbgInfoIntrinsic>(I)) ++Size; - } - - // Do not tail duplicate a block that has thousands of successors into a block - // with a single successor if the block has many other predecessors. This can - // cause an N^2 explosion in CFG edges (and PHI node entries), as seen in - // cases that have a large number of indirect gotos. - unsigned NumSuccs = DTI->getNumSuccessors(); - if (NumSuccs > 8) { - unsigned TooMany = 128; - if (NumSuccs >= TooMany) return false; - TooMany = TooMany/NumSuccs; - for (; PI != PE; ++PI) - if (TooMany-- == 0) return false; - } - - // If this unconditional branch is a fall-through, be careful about - // tail duplicating it. In particular, we don't want to taildup it if the - // original block will still be there after taildup is completed: doing so - // would eliminate the fall-through, requiring unconditional branches. - Function::iterator DestI = Dest; - if (&*--DestI == BI->getParent()) { - // The uncond branch is a fall-through. Tail duplication of the block is - // will eliminate the fall-through-ness and end up cloning the terminator - // at the end of the Dest block. Since the original Dest block will - // continue to exist, this means that one or the other will not be able to - // fall through. One typical example that this helps with is code like: - // if (a) - // foo(); - // if (b) - // foo(); - // Cloning the 'if b' block into the end of the first foo block is messy. - - // The messy case is when the fall-through block falls through to other - // blocks. This is what we would be preventing if we cloned the block. - DestI = Dest; - if (++DestI != Dest->getParent()->end()) { - BasicBlock *DestSucc = DestI; - // If any of Dest's successors are fall-throughs, don't do this xform. - for (succ_iterator SI = succ_begin(Dest), SE = succ_end(Dest); - SI != SE; ++SI) - if (*SI == DestSucc) - return false; - } - } - - // Finally, check that we haven't redirected to this target block earlier; - // there are cases where we loop forever if we don't check this (PR 2323). - if (!CycleDetector.insert(Dest)) - return false; - - return true; -} - -/// FindObviousSharedDomOf - We know there is a branch from SrcBlock to -/// DestBlock, and that SrcBlock is not the only predecessor of DstBlock. If we -/// can find a predecessor of SrcBlock that is a dominator of both SrcBlock and -/// DstBlock, return it. -static BasicBlock *FindObviousSharedDomOf(BasicBlock *SrcBlock, - BasicBlock *DstBlock) { - // SrcBlock must have a single predecessor. 
- pred_iterator PI = pred_begin(SrcBlock), PE = pred_end(SrcBlock); - if (PI == PE || ++PI != PE) return 0; - - BasicBlock *SrcPred = *pred_begin(SrcBlock); - - // Look at the predecessors of DstBlock. One of them will be SrcBlock. If - // there is only one other pred, get it, otherwise we can't handle it. - PI = pred_begin(DstBlock); PE = pred_end(DstBlock); - BasicBlock *DstOtherPred = 0; - BasicBlock *P = *PI; - if (P == SrcBlock) { - if (++PI == PE) return 0; - DstOtherPred = *PI; - if (++PI != PE) return 0; - } else { - DstOtherPred = P; - if (++PI == PE || *PI != SrcBlock || ++PI != PE) return 0; - } - - // We can handle two situations here: "if then" and "if then else" blocks. An - // 'if then' situation is just where DstOtherPred == SrcPred. - if (DstOtherPred == SrcPred) - return SrcPred; - - // Check to see if we have an "if then else" situation, which means that - // DstOtherPred will have a single predecessor and it will be SrcPred. - PI = pred_begin(DstOtherPred); PE = pred_end(DstOtherPred); - if (PI != PE && *PI == SrcPred) { - if (++PI != PE) return 0; // Not a single pred. - return SrcPred; // Otherwise, it's an "if then" situation. Return the if. - } - - // Otherwise, this is something we can't handle. - return 0; -} - - -/// eliminateUnconditionalBranch - Clone the instructions from the destination -/// block into the source block, eliminating the specified unconditional branch. -/// If the destination block defines values used by successors of the dest -/// block, we may need to insert PHI nodes. -/// -void TailDup::eliminateUnconditionalBranch(BranchInst *Branch) { - BasicBlock *SourceBlock = Branch->getParent(); - BasicBlock *DestBlock = Branch->getSuccessor(0); - assert(SourceBlock != DestBlock && "Our predicate is broken!"); - - DEBUG(dbgs() << "TailDuplication[" << SourceBlock->getParent()->getName() - << "]: Eliminating branch: " << *Branch); - - // See if we can avoid duplicating code by moving it up to a dominator of both - // blocks. - if (BasicBlock *DomBlock = FindObviousSharedDomOf(SourceBlock, DestBlock)) { - DEBUG(dbgs() << "Found shared dominator: " << DomBlock->getName() << "\n"); - - // If there are non-phi instructions in DestBlock that have no operands - // defined in DestBlock, and if the instruction has no side effects, we can - // move the instruction to DomBlock instead of duplicating it. - BasicBlock::iterator BBI = DestBlock->getFirstNonPHI(); - while (!isa<TerminatorInst>(BBI)) { - Instruction *I = BBI++; - - bool CanHoist = I->isSafeToSpeculativelyExecute() && - !I->mayReadFromMemory(); - if (CanHoist) { - for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) - if (Instruction *OpI = dyn_cast<Instruction>(I->getOperand(op))) - if (OpI->getParent() == DestBlock || - (isa<InvokeInst>(OpI) && OpI->getParent() == DomBlock)) { - CanHoist = false; - break; - } - if (CanHoist) { - // Remove from DestBlock, move right before the term in DomBlock. - DestBlock->getInstList().remove(I); - DomBlock->getInstList().insert(DomBlock->getTerminator(), I); - DEBUG(dbgs() << "Hoisted: " << *I); - } - } - } - } - - // Tail duplication can not update SSA properties correctly if the values - // defined in the duplicated tail are used outside of the tail itself. For - // this reason, we spill all values that are used outside of the tail to the - // stack. - for (BasicBlock::iterator I = DestBlock->begin(); I != DestBlock->end(); ++I) - if (I->isUsedOutsideOfBlock(DestBlock)) { - // We found a use outside of the tail. 
Create a new stack slot to - // break this inter-block usage pattern. - DemoteRegToStack(*I); - } - - // We are going to have to map operands from the original block B to the new - // copy of the block B'. If there are PHI nodes in the DestBlock, these PHI - // nodes also define part of this mapping. Loop over these PHI nodes, adding - // them to our mapping. - // - std::map<Value*, Value*> ValueMapping; - - BasicBlock::iterator BI = DestBlock->begin(); - bool HadPHINodes = isa<PHINode>(BI); - for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) - ValueMapping[PN] = PN->getIncomingValueForBlock(SourceBlock); - - // Clone the non-phi instructions of the dest block into the source block, - // keeping track of the mapping... - // - for (; BI != DestBlock->end(); ++BI) { - Instruction *New = BI->clone(); - New->setName(BI->getName()); - SourceBlock->getInstList().push_back(New); - ValueMapping[BI] = New; - } - - // Now that we have built the mapping information and cloned all of the - // instructions (giving us a new terminator, among other things), walk the new - // instructions, rewriting references of old instructions to use new - // instructions. - // - BI = Branch; ++BI; // Get an iterator to the first new instruction - for (; BI != SourceBlock->end(); ++BI) - for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) { - std::map<Value*, Value*>::const_iterator I = - ValueMapping.find(BI->getOperand(i)); - if (I != ValueMapping.end()) - BI->setOperand(i, I->second); - } - - // Next we check to see if any of the successors of DestBlock had PHI nodes. - // If so, we need to add entries to the PHI nodes for SourceBlock now. - for (succ_iterator SI = succ_begin(DestBlock), SE = succ_end(DestBlock); - SI != SE; ++SI) { - BasicBlock *Succ = *SI; - for (BasicBlock::iterator PNI = Succ->begin(); isa<PHINode>(PNI); ++PNI) { - PHINode *PN = cast<PHINode>(PNI); - // Ok, we have a PHI node. Figure out what the incoming value was for the - // DestBlock. - Value *IV = PN->getIncomingValueForBlock(DestBlock); - - // Remap the value if necessary... - std::map<Value*, Value*>::const_iterator I = ValueMapping.find(IV); - if (I != ValueMapping.end()) - IV = I->second; - PN->addIncoming(IV, SourceBlock); - } - } - - // Next, remove the old branch instruction, and any PHI node entries that we - // had. - BI = Branch; ++BI; // Get an iterator to the first new instruction - DestBlock->removePredecessor(SourceBlock); // Remove entries in PHI nodes... - SourceBlock->getInstList().erase(Branch); // Destroy the uncond branch... - - // Final step: now that we have finished everything up, walk the cloned - // instructions one last time, constant propagating and DCE'ing them, because - // they may not be needed anymore. - // - if (HadPHINodes) { - while (BI != SourceBlock->end()) { - Instruction *Inst = BI++; - if (isInstructionTriviallyDead(Inst)) - Inst->eraseFromParent(); - else if (Value *V = SimplifyInstruction(Inst)) { - Inst->replaceAllUsesWith(V); - Inst->eraseFromParent(); - } - } - } - - ++NumEliminated; // We just killed a branch! -} |
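Returning to the SimplifyLibCalls hunks above: the corrected fold turns strcmp("", x) into the negated, zero-extended first byte of x, which preserves the sign contract of the C function. A standalone check (foldedStrcmpEmptyLhs is an invented name):

#include <cassert>
#include <cstring>

int foldedStrcmpEmptyLhs(const char *X) {
  // strcmp("", x)  ->  -(zero-extended first byte of x)
  return -static_cast<int>(static_cast<unsigned char>(X[0]));
}

int main() {
  const char *Samples[] = {"", "a", "zzz"};
  for (const char *S : Samples) {
    int Ref = std::strcmp("", S);
    int Folded = foldedStrcmpEmptyLhs(S);
    // Only the sign of strcmp's result is specified; the fold preserves it.
    assert((Ref == 0) == (Folded == 0) && (Ref < 0) == (Folded < 0));
  }
  return 0;
}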