Diffstat (limited to 'lib/Transforms')
78 files changed, 8130 insertions, 6203 deletions
diff --git a/lib/Transforms/CMakeLists.txt b/lib/Transforms/CMakeLists.txt
index de1353e..2bb6e90 100644
--- a/lib/Transforms/CMakeLists.txt
+++ b/lib/Transforms/CMakeLists.txt
@@ -5,3 +5,4 @@ add_subdirectory(Scalar)
 add_subdirectory(IPO)
 add_subdirectory(Vectorize)
 add_subdirectory(Hello)
+add_subdirectory(ObjCARC)
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp
index 385544a..e6fa4ed 100644
--- a/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -514,14 +514,13 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
   // Attribute - Keep track of the parameter attributes for the arguments
   // that we are *not* promoting. For the ones that we do promote, the parameter
   // attributes are lost
-  SmallVector<AttributeWithIndex, 8> AttributesVec;
+  SmallVector<AttributeSet, 8> AttributesVec;
   const AttributeSet &PAL = F->getAttributes();

   // Add any return attributes.
-  Attribute attrs = PAL.getRetAttributes();
-  if (attrs.hasAttributes())
-    AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::ReturnIndex,
-                                                    attrs));
+  if (PAL.hasAttributes(AttributeSet::ReturnIndex))
+    AttributesVec.push_back(AttributeSet::get(F->getContext(),
+                                              PAL.getRetAttributes()));

   // First, determine the new argument list
   unsigned ArgIndex = 1;
@@ -537,9 +536,12 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
     } else if (!ArgsToPromote.count(I)) {
       // Unchanged argument
       Params.push_back(I->getType());
-      Attribute attrs = PAL.getParamAttributes(ArgIndex);
-      if (attrs.hasAttributes())
-        AttributesVec.push_back(AttributeWithIndex::get(Params.size(), attrs));
+      AttributeSet attrs = PAL.getParamAttributes(ArgIndex);
+      if (attrs.hasAttributes(ArgIndex)) {
+        AttrBuilder B(attrs, ArgIndex);
+        AttributesVec.
+          push_back(AttributeSet::get(F->getContext(), Params.size(), B));
+      }
     } else if (I->use_empty()) {
       // Dead argument (which are always marked as promotable)
       ++NumArgumentsDead;
@@ -591,10 +593,9 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
   }

   // Add any function attributes.
-  attrs = PAL.getFnAttributes();
-  if (attrs.hasAttributes())
-    AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex,
-                                                    attrs));
+  if (PAL.hasAttributes(AttributeSet::FunctionIndex))
+    AttributesVec.push_back(AttributeSet::get(FTy->getContext(),
+                                              PAL.getFnAttributes()));

   Type *RetTy = FTy->getReturnType();
@@ -639,10 +640,9 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
     const AttributeSet &CallPAL = CS.getAttributes();

     // Add any return attributes.
-    Attribute attrs = CallPAL.getRetAttributes();
-    if (attrs.hasAttributes())
-      AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::ReturnIndex,
-                                                      attrs));
+    if (CallPAL.hasAttributes(AttributeSet::ReturnIndex))
+      AttributesVec.push_back(AttributeSet::get(F->getContext(),
+                                                CallPAL.getRetAttributes()));

     // Loop over the operands, inserting GEP and loads in the caller as
     // appropriate.
@@ -653,10 +653,11 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
       if (!ArgsToPromote.count(I) && !ByValArgsToTransform.count(I)) {
         Args.push_back(*AI);          // Unmodified argument

-        Attribute Attrs = CallPAL.getParamAttributes(ArgIndex);
-        if (Attrs.hasAttributes())
-          AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs));
-
+        if (CallPAL.hasAttributes(ArgIndex)) {
+          AttrBuilder B(CallPAL, ArgIndex);
+          AttributesVec.
+            push_back(AttributeSet::get(F->getContext(), Args.size(), B));
+        }
       } else if (ByValArgsToTransform.count(I)) {
         // Emit a GEP and load for each element of the struct.
         Type *AgTy = cast<PointerType>(I->getType())->getElementType();
@@ -715,16 +716,17 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
     // Push any varargs arguments on the list.
     for (; AI != CS.arg_end(); ++AI, ++ArgIndex) {
       Args.push_back(*AI);
-      Attribute Attrs = CallPAL.getParamAttributes(ArgIndex);
-      if (Attrs.hasAttributes())
-        AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs));
+      if (CallPAL.hasAttributes(ArgIndex)) {
+        AttrBuilder B(CallPAL, ArgIndex);
+        AttributesVec.
+          push_back(AttributeSet::get(F->getContext(), Args.size(), B));
+      }
     }

     // Add any function attributes.
-    attrs = CallPAL.getFnAttributes();
-    if (attrs.hasAttributes())
-      AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex,
-                                                      attrs));
+    if (CallPAL.hasAttributes(AttributeSet::FunctionIndex))
+      AttributesVec.push_back(AttributeSet::get(Call->getContext(),
+                                                CallPAL.getFnAttributes()));

     Instruction *New;
     if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
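The hunks above (and the parallel ones in the files that follow) migrate from the old AttributeWithIndex pairs to index-aware AttributeSet slots. A minimal sketch of the recurring rebuild step, using only calls that appear in the patch; the helper name copyParamAttr and its parameters are illustrative, not part of the change:

  // Re-key the attributes of old parameter slot OldIdx under NewIdx, the
  // parameter's position in the rebuilt function. AttrBuilder(PAL, OldIdx)
  // reads one slot; AttributeSet::get wraps it back up under the new index.
  static AttributeSet copyParamAttr(LLVMContext &C, const AttributeSet &PAL,
                                    unsigned OldIdx, unsigned NewIdx) {
    AttrBuilder B(PAL, OldIdx);
    return AttributeSet::get(C, NewIdx, B);
  }

The per-slot sets collected this way are folded back into one attribute list with AttributeSet::get(C, AttributesVec), exactly as the hunks do before installing the new list on the function or call site.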
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp
index ff040e7..49ef1e7 100644
--- a/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -272,14 +272,13 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {

     // Drop any attributes that were on the vararg arguments.
     AttributeSet PAL = CS.getAttributes();
-    if (!PAL.isEmpty() && PAL.getSlot(PAL.getNumSlots() - 1).Index > NumArgs) {
-      SmallVector<AttributeWithIndex, 8> AttributesVec;
-      for (unsigned i = 0; PAL.getSlot(i).Index <= NumArgs; ++i)
-        AttributesVec.push_back(PAL.getSlot(i));
-      Attribute FnAttrs = PAL.getFnAttributes();
-      if (FnAttrs.hasAttributes())
-        AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex,
-                                                        FnAttrs));
+    if (!PAL.isEmpty() && PAL.getSlotIndex(PAL.getNumSlots() - 1) > NumArgs) {
+      SmallVector<AttributeSet, 8> AttributesVec;
+      for (unsigned i = 0; PAL.getSlotIndex(i) <= NumArgs; ++i)
+        AttributesVec.push_back(PAL.getSlotAttributes(i));
+      if (PAL.hasAttributes(AttributeSet::FunctionIndex))
+        AttributesVec.push_back(AttributeSet::get(Fn.getContext(),
+                                                  PAL.getFnAttributes()));
       PAL = AttributeSet::get(Fn.getContext(), AttributesVec);
     }
@@ -351,7 +350,7 @@ bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn)
   if (Fn.use_empty())
     return false;

-  llvm::SmallVector<unsigned, 8> UnusedArgs;
+  SmallVector<unsigned, 8> UnusedArgs;
   for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end();
        I != E; ++I) {
     Argument *Arg = I;
@@ -697,15 +696,10 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
   std::vector<Type*> Params;

   // Set up to build a new list of parameter attributes.
-  SmallVector<AttributeWithIndex, 8> AttributesVec;
+  SmallVector<AttributeSet, 8> AttributesVec;
   const AttributeSet &PAL = F->getAttributes();

-  // The existing function return attributes.
-  Attribute RAttrs = PAL.getRetAttributes();
-  Attribute FnAttrs = PAL.getFnAttributes();
-
   // Find out the new return value.
-
   Type *RetTy = FTy->getReturnType();
   Type *NRetTy = NULL;
   unsigned RetCount = NumRetVals(F);
@@ -759,22 +753,29 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {

   assert(NRetTy && "No new return type found?");

+  // The existing function return attributes.
+  AttributeSet RAttrs = PAL.getRetAttributes();
+
   // Remove any incompatible attributes, but only if we removed all return
   // values. Otherwise, ensure that we don't have any conflicting attributes
   // here. Currently, this should not be possible, but special handling might be
   // required when new return value attributes are added.
   if (NRetTy->isVoidTy())
     RAttrs =
-      Attribute::get(NRetTy->getContext(), AttrBuilder(RAttrs).
-                     removeAttributes(Attribute::typeIncompatible(NRetTy)));
+      AttributeSet::get(NRetTy->getContext(), AttributeSet::ReturnIndex,
+                        AttrBuilder(RAttrs, AttributeSet::ReturnIndex).
+                        removeAttributes(AttributeFuncs::
+                                         typeIncompatible(NRetTy, AttributeSet::ReturnIndex),
+                                         AttributeSet::ReturnIndex));
   else
-    assert(!AttrBuilder(RAttrs).
-             hasAttributes(Attribute::typeIncompatible(NRetTy)) &&
+    assert(!AttrBuilder(RAttrs, AttributeSet::ReturnIndex).
+             hasAttributes(AttributeFuncs::
+                           typeIncompatible(NRetTy, AttributeSet::ReturnIndex),
+                           AttributeSet::ReturnIndex) &&
            "Return attributes no longer compatible?");

-  if (RAttrs.hasAttributes())
-    AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::ReturnIndex,
-                                                    RAttrs));
+  if (RAttrs.hasAttributes(AttributeSet::ReturnIndex))
+    AttributesVec.push_back(AttributeSet::get(NRetTy->getContext(), RAttrs));

   // Remember which arguments are still alive.
   SmallVector<bool, 10> ArgAlive(FTy->getNumParams(), false);
@@ -791,9 +792,11 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {

       // Get the original parameter attributes (skipping the first one, that is
       // for the return value.
-      Attribute Attrs = PAL.getParamAttributes(i + 1);
-      if (Attrs.hasAttributes())
-        AttributesVec.push_back(AttributeWithIndex::get(Params.size(), Attrs));
+      if (PAL.hasAttributes(i + 1)) {
+        AttrBuilder B(PAL, i + 1);
+        AttributesVec.
+          push_back(AttributeSet::get(F->getContext(), Params.size(), B));
+      }
     } else {
       ++NumArgumentsEliminated;
       DEBUG(dbgs() << "DAE - Removing argument " << i << " (" << I->getName()
@@ -801,9 +804,9 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
     }
   }

-  if (FnAttrs.hasAttributes())
-    AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex,
-                                                    FnAttrs));
+  if (PAL.hasAttributes(AttributeSet::FunctionIndex))
+    AttributesVec.push_back(AttributeSet::get(F->getContext(),
+                                              PAL.getFnAttributes()));

   // Reconstruct the AttributesList based on the vector we constructed.
   AttributeSet NewPAL = AttributeSet::get(F->getContext(), AttributesVec);
@@ -836,15 +839,18 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
     const AttributeSet &CallPAL = CS.getAttributes();

     // The call return attributes.
-    Attribute RAttrs = CallPAL.getRetAttributes();
-    Attribute FnAttrs = CallPAL.getFnAttributes();
+    AttributeSet RAttrs = CallPAL.getRetAttributes();
+
     // Adjust in case the function was changed to return void.
     RAttrs =
-      Attribute::get(NF->getContext(), AttrBuilder(RAttrs).
-                     removeAttributes(Attribute::typeIncompatible(NF->getReturnType())));
-    if (RAttrs.hasAttributes())
-      AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::ReturnIndex,
-                                                      RAttrs));
+      AttributeSet::get(NF->getContext(), AttributeSet::ReturnIndex,
+                        AttrBuilder(RAttrs, AttributeSet::ReturnIndex).
+                        removeAttributes(AttributeFuncs::
+                                         typeIncompatible(NF->getReturnType(),
+                                                          AttributeSet::ReturnIndex),
+                                         AttributeSet::ReturnIndex));
+    if (RAttrs.hasAttributes(AttributeSet::ReturnIndex))
+      AttributesVec.push_back(AttributeSet::get(NF->getContext(), RAttrs));

     // Declare these outside of the loops, so we can reuse them for the second
     // loop, which loops the varargs.
@@ -856,22 +862,26 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
       if (ArgAlive[i]) {
         Args.push_back(*I);
         // Get original parameter attributes, but skip return attributes.
-        Attribute Attrs = CallPAL.getParamAttributes(i + 1);
-        if (Attrs.hasAttributes())
-          AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs));
+        if (CallPAL.hasAttributes(i + 1)) {
+          AttrBuilder B(CallPAL, i + 1);
+          AttributesVec.
+            push_back(AttributeSet::get(F->getContext(), Args.size(), B));
+        }
       }

     // Push any varargs arguments on the list. Don't forget their attributes.
     for (CallSite::arg_iterator E = CS.arg_end(); I != E; ++I, ++i) {
       Args.push_back(*I);
-      Attribute Attrs = CallPAL.getParamAttributes(i + 1);
-      if (Attrs.hasAttributes())
-        AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs));
+      if (CallPAL.hasAttributes(i + 1)) {
+        AttrBuilder B(CallPAL, i + 1);
+        AttributesVec.
+          push_back(AttributeSet::get(F->getContext(), Args.size(), B));
+      }
     }

-    if (FnAttrs.hasAttributes())
-      AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex,
-                                                      FnAttrs));
+    if (CallPAL.hasAttributes(AttributeSet::FunctionIndex))
+      AttributesVec.push_back(AttributeSet::get(Call->getContext(),
+                                                CallPAL.getFnAttributes()));

     // Reconstruct the AttributesList based on the vector we constructed.
     AttributeSet NewCallPAL = AttributeSet::get(F->getContext(), AttributesVec);
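RemoveDeadStuffFromFunction may rewrite a function to return void, so the patch now routes return attributes through AttributeFuncs::typeIncompatible to drop whatever no longer applies to the new type. A sketch of that step in isolation, using only the calls visible above; dropIncompatibleRetAttrs is a hypothetical name:

  // Rebuild the return slot of RAttrs without attributes that are illegal
  // on Ty (mirrors the NRetTy->isVoidTy() branch in the hunk above).
  static AttributeSet dropIncompatibleRetAttrs(LLVMContext &C, Type *Ty,
                                               AttributeSet RAttrs) {
    AttrBuilder B(RAttrs, AttributeSet::ReturnIndex);
    B.removeAttributes(AttributeFuncs::typeIncompatible(Ty,
                                              AttributeSet::ReturnIndex),
                       AttributeSet::ReturnIndex);
    return AttributeSet::get(C, AttributeSet::ReturnIndex, B);
  }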
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
index e9bc4ad..a75212a 100644
--- a/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -215,14 +215,13 @@ bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) {
     AttrBuilder B;
     B.addAttribute(Attribute::ReadOnly)
       .addAttribute(Attribute::ReadNone);
-    F->removeAttribute(AttributeSet::FunctionIndex,
-                       Attribute::get(F->getContext(), B));
+    F->removeAttributes(AttributeSet::FunctionIndex,
+                        AttributeSet::get(F->getContext(),
+                                          AttributeSet::FunctionIndex, B));

     // Add in the new attribute.
-    B.clear();
-    B.addAttribute(ReadsMemory ? Attribute::ReadOnly : Attribute::ReadNone);
     F->addAttribute(AttributeSet::FunctionIndex,
-                    Attribute::get(F->getContext(), B));
+                    ReadsMemory ? Attribute::ReadOnly : Attribute::ReadNone);

     if (ReadsMemory)
       ++NumReadOnly;
@@ -381,7 +380,7 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
     for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end();
          A != E; ++A) {
       if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) {
-        A->addAttr(Attribute::get(F->getContext(), B));
+        A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo() + 1, B));
         ++NumNoCapture;
         Changed = true;
       }
@@ -396,7 +395,7 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
       if (!Tracker.Captured) {
         if (Tracker.Uses.empty()) {
           // If it's trivially not captured, mark it nocapture now.
-          A->addAttr(Attribute::get(F->getContext(), B));
+          A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo()+1, B));
           ++NumNoCapture;
           Changed = true;
         } else {
@@ -431,7 +430,9 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
         ArgumentSCC[0]->Uses[0] == ArgumentSCC[0]) {
       ArgumentSCC[0]->
         Definition->
-        addAttr(Attribute::get(ArgumentSCC[0]->Definition->getContext(), B));
+        addAttr(AttributeSet::get(ArgumentSCC[0]->Definition->getContext(),
+                                  ArgumentSCC[0]->Definition->getArgNo() + 1,
+                                  B));
       ++NumNoCapture;
       Changed = true;
     }
@@ -473,7 +474,7 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {

     for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
       Argument *A = ArgumentSCC[i]->Definition;
-      A->addAttr(Attribute::get(A->getContext(), B));
+      A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B));
       ++NumNoCapture;
       Changed = true;
     }
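The FunctionAttrs hunks attach nocapture to individual arguments; under the new API the argument's slot index must be spelled out explicitly. A sketch of the idiom, with markNoCapture as an illustrative name:

  // Argument attribute slots are 1-based: slot 0 is the return value, so
  // argument N lives at index N + 1 - hence the getArgNo() + 1 in the patch.
  static void markNoCapture(Argument &A) {
    AttrBuilder B;
    B.addAttribute(Attribute::NoCapture);
    A.addAttr(AttributeSet::get(A.getContext(), A.getArgNo() + 1, B));
  }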
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index abd37c2..2b9d667 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -448,8 +448,8 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV,
       Dead[i].second->eraseFromParent();
       Instruction *I = Dead[i].first;
       do {
-        if (isAllocationFn(I, TLI))
-          break;
+        if (isAllocationFn(I, TLI))
+          break;
         Instruction *J = dyn_cast<Instruction>(I->getOperand(0));
         if (!J)
           break;
@@ -1825,7 +1825,8 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
                                              GlobalValue::InternalLinkage,
                                              ConstantInt::getFalse(GV->getContext()),
                                              GV->getName()+".b",
-                                             GV->getThreadLocalMode());
+                                             GV->getThreadLocalMode(),
+                                             GV->getType()->getAddressSpace());
   GV->getParent()->getGlobalList().insert(GV, NewGV);

   Constant *InitVal = GV->getInitializer();
@@ -1845,10 +1846,10 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
         bool StoringOther = SI->getOperand(0) == OtherVal;
         // Only do this if we weren't storing a loaded value.
         Value *StoreVal;
-        if (StoringOther || SI->getOperand(0) == InitVal)
+        if (StoringOther || SI->getOperand(0) == InitVal) {
           StoreVal = ConstantInt::get(Type::getInt1Ty(GV->getContext()),
                                       StoringOther);
-        else {
+        } else {
           // Otherwise, we are storing a previously loaded copy.  To do this,
           // change the copy from copying the original value to just copying the
           // bool.
@@ -1887,6 +1888,9 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
       UI->eraseFromParent();
   }

+  // Retain the name of the old global variable. People who are debugging their
+  // programs may expect these variables to be named the same.
+  NewGV->takeName(GV);
   GV->eraseFromParent();
   return true;
 }
@@ -1989,7 +1993,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
     return Changed;

   } else if (GS.StoredType <= GlobalStatus::isInitializerStored) {
-    DEBUG(dbgs() << "MARKING CONSTANT: " << *GV);
+    DEBUG(dbgs() << "MARKING CONSTANT: " << *GV << "\n");
     GV->setConstant(true);

     // Clean up any obviously simplifiable users now.
@@ -2067,12 +2071,12 @@ static void ChangeCalleesToFastCall(Function *F) {

 static AttributeSet StripNest(LLVMContext &C, const AttributeSet &Attrs) {
   for (unsigned i = 0, e = Attrs.getNumSlots(); i != e; ++i) {
-    if (!Attrs.getSlot(i).Attrs.hasAttribute(Attribute::Nest))
+    unsigned Index = Attrs.getSlotIndex(i);
+    if (!Attrs.getSlotAttributes(i).hasAttribute(Index, Attribute::Nest))
       continue;

     // There can be only one.
-    return Attrs.removeAttr(C, Attrs.getSlot(i).Index,
-                            Attribute::get(C, Attribute::Nest));
+    return Attrs.removeAttribute(C, Index, Attribute::Nest);
   }

   return Attrs;
@@ -2584,24 +2588,38 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
   while (1) {
     Constant *InstResult = 0;

+    DEBUG(dbgs() << "Evaluating Instruction: " << *CurInst << "\n");
+
     if (StoreInst *SI = dyn_cast<StoreInst>(CurInst)) {
-      if (!SI->isSimple()) return false;  // no volatile/atomic accesses.
+      if (!SI->isSimple()) {
+        DEBUG(dbgs() << "Store is not simple! Can not evaluate.\n");
+        return false;  // no volatile/atomic accesses.
+      }
       Constant *Ptr = getVal(SI->getOperand(1));
-      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
+      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
+        DEBUG(dbgs() << "Folding constant ptr expression: " << *Ptr);
         Ptr = ConstantFoldConstantExpression(CE, TD, TLI);
-      if (!isSimpleEnoughPointerToCommit(Ptr))
+        DEBUG(dbgs() << "; To: " << *Ptr << "\n");
+      }
+      if (!isSimpleEnoughPointerToCommit(Ptr)) {
         // If this is too complex for us to commit, reject it.
+        DEBUG(dbgs() << "Pointer is too complex for us to evaluate store.");
         return false;
+      }

       Constant *Val = getVal(SI->getOperand(0));

       // If this might be too difficult for the backend to handle (e.g. the addr
       // of one global variable divided by another) then we can't commit it.
-      if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, TD))
+      if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, TD)) {
+        DEBUG(dbgs() << "Store value is too complex to evaluate store. " << *Val
+              << "\n");
        return false;
+      }

-      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
+      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
         if (CE->getOpcode() == Instruction::BitCast) {
+          DEBUG(dbgs() << "Attempting to resolve bitcast on constant ptr.\n");
           // If we're evaluating a store through a bitcast, then we need
           // to pull the bitcast off the pointer type and push it onto the
           // stored value.
@@ -2630,6 +2648,8 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
             // If we can't improve the situation by introspecting NewTy,
             // we have to give up.
           } else {
+            DEBUG(dbgs() << "Failed to bitcast constant ptr, can not "
+                  "evaluate.\n");
             return false;
           }
         }
@@ -2637,25 +2657,36 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
         // If we found compatible types, go ahead and push the bitcast
         // onto the stored value.
         Val = ConstantExpr::getBitCast(Val, NewTy);
+
+        DEBUG(dbgs() << "Evaluated bitcast: " << *Val << "\n");
       }
+    }

       MutatedMemory[Ptr] = Val;
     } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(CurInst)) {
       InstResult = ConstantExpr::get(BO->getOpcode(),
                                      getVal(BO->getOperand(0)),
                                      getVal(BO->getOperand(1)));
+      DEBUG(dbgs() << "Found a BinaryOperator! Simplifying: " << *InstResult
+            << "\n");
     } else if (CmpInst *CI = dyn_cast<CmpInst>(CurInst)) {
       InstResult = ConstantExpr::getCompare(CI->getPredicate(),
                                             getVal(CI->getOperand(0)),
                                             getVal(CI->getOperand(1)));
+      DEBUG(dbgs() << "Found a CmpInst! Simplifying: " << *InstResult
+            << "\n");
     } else if (CastInst *CI = dyn_cast<CastInst>(CurInst)) {
       InstResult = ConstantExpr::getCast(CI->getOpcode(),
                                          getVal(CI->getOperand(0)),
                                          CI->getType());
+      DEBUG(dbgs() << "Found a Cast! Simplifying: " << *InstResult
+            << "\n");
     } else if (SelectInst *SI = dyn_cast<SelectInst>(CurInst)) {
       InstResult = ConstantExpr::getSelect(getVal(SI->getOperand(0)),
                                            getVal(SI->getOperand(1)),
                                            getVal(SI->getOperand(2)));
+      DEBUG(dbgs() << "Found a Select! Simplifying: " << *InstResult
+            << "\n");
     } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurInst)) {
       Constant *P = getVal(GEP->getOperand(0));
       SmallVector<Constant*, 8> GEPOps;
@@ -2665,41 +2696,70 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
       InstResult =
         ConstantExpr::getGetElementPtr(P, GEPOps,
                                        cast<GEPOperator>(GEP)->isInBounds());
+      DEBUG(dbgs() << "Found a GEP! Simplifying: " << *InstResult
+            << "\n");
     } else if (LoadInst *LI = dyn_cast<LoadInst>(CurInst)) {
-      if (!LI->isSimple()) return false;  // no volatile/atomic accesses.
+
+      if (!LI->isSimple()) {
+        DEBUG(dbgs() << "Found a Load! Not a simple load, can not evaluate.\n");
+        return false;  // no volatile/atomic accesses.
+      }
+
       Constant *Ptr = getVal(LI->getOperand(0));
-      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
+      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
         Ptr = ConstantFoldConstantExpression(CE, TD, TLI);
+        DEBUG(dbgs() << "Found a constant pointer expression, constant "
+              "folding: " << *Ptr << "\n");
+      }
       InstResult = ComputeLoadResult(Ptr);
-      if (InstResult == 0) return false; // Could not evaluate load.
+      if (InstResult == 0) {
+        DEBUG(dbgs() << "Failed to compute load result. Can not evaluate load."
+              "\n");
+        return false; // Could not evaluate load.
+      }
+
+      DEBUG(dbgs() << "Evaluated load: " << *InstResult << "\n");
     } else if (AllocaInst *AI = dyn_cast<AllocaInst>(CurInst)) {
-      if (AI->isArrayAllocation()) return false;  // Cannot handle array allocs.
+      if (AI->isArrayAllocation()) {
+        DEBUG(dbgs() << "Found an array alloca. Can not evaluate.\n");
+        return false;  // Cannot handle array allocs.
+      }
       Type *Ty = AI->getType()->getElementType();
       AllocaTmps.push_back(new GlobalVariable(Ty, false,
                                               GlobalValue::InternalLinkage,
                                               UndefValue::get(Ty),
                                               AI->getName()));
       InstResult = AllocaTmps.back();
+      DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n");
     } else if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) {
       CallSite CS(CurInst);

       // Debug info can safely be ignored here.
       if (isa<DbgInfoIntrinsic>(CS.getInstruction())) {
+        DEBUG(dbgs() << "Ignoring debug info.\n");
         ++CurInst;
         continue;
       }

       // Cannot handle inline asm.
-      if (isa<InlineAsm>(CS.getCalledValue())) return false;
+      if (isa<InlineAsm>(CS.getCalledValue())) {
+        DEBUG(dbgs() << "Found inline asm, can not evaluate.\n");
+        return false;
+      }

       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
         if (MemSetInst *MSI = dyn_cast<MemSetInst>(II)) {
-          if (MSI->isVolatile()) return false;
+          if (MSI->isVolatile()) {
+            DEBUG(dbgs() << "Can not optimize a volatile memset " <<
+                  "intrinsic.\n");
+            return false;
+          }
           Constant *Ptr = getVal(MSI->getDest());
           Constant *Val = getVal(MSI->getValue());
           Constant *DestVal = ComputeLoadResult(getVal(Ptr));
           if (Val->isNullValue() && DestVal && DestVal->isNullValue()) {
             // This memset is a no-op.
+            DEBUG(dbgs() << "Ignoring no-op memset.\n");
             ++CurInst;
             continue;
           }
@@ -2707,6 +2767,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,

         if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
             II->getIntrinsicID() == Intrinsic::lifetime_end) {
+          DEBUG(dbgs() << "Ignoring lifetime intrinsic.\n");
           ++CurInst;
           continue;
         }
@@ -2714,8 +2775,10 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
         if (II->getIntrinsicID() == Intrinsic::invariant_start) {
           // We don't insert an entry into Values, as it doesn't have a
           // meaningful return value.
-          if (!II->use_empty())
+          if (!II->use_empty()) {
+            DEBUG(dbgs() << "Found unused invariant_start. Cant evaluate.\n");
             return false;
+          }
           ConstantInt *Size = cast<ConstantInt>(II->getArgOperand(0));
           Value *PtrArg = getVal(II->getArgOperand(1));
           Value *Ptr = PtrArg->stripPointerCasts();
@@ -2723,20 +2786,30 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
             Type *ElemTy = cast<PointerType>(GV->getType())->getElementType();
             if (!Size->isAllOnesValue() &&
                 Size->getValue().getLimitedValue() >=
-                TD->getTypeStoreSize(ElemTy))
+                TD->getTypeStoreSize(ElemTy)) {
               Invariants.insert(GV);
+              DEBUG(dbgs() << "Found a global var that is an invariant: " << *GV
+                    << "\n");
+            } else {
+              DEBUG(dbgs() << "Found a global var, but can not treat it as an "
+                    "invariant.\n");
+            }
           }
           // Continue even if we do nothing.
           ++CurInst;
           continue;
         }
+
+        DEBUG(dbgs() << "Unknown intrinsic. Can not evaluate.\n");
         return false;
       }

       // Resolve function pointers.
       Function *Callee = dyn_cast<Function>(getVal(CS.getCalledValue()));
-      if (!Callee || Callee->mayBeOverridden())
+      if (!Callee || Callee->mayBeOverridden()) {
+        DEBUG(dbgs() << "Can not resolve function pointer.\n");
         return false;  // Cannot resolve.
+      }

       SmallVector<Constant*, 8> Formals;
       for (User::op_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i)
@@ -2746,22 +2819,38 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
         // If this is a function we can constant fold, do it.
         if (Constant *C = ConstantFoldCall(Callee, Formals, TLI)) {
           InstResult = C;
+          DEBUG(dbgs() << "Constant folded function call. Result: " <<
+                *InstResult << "\n");
         } else {
+          DEBUG(dbgs() << "Can not constant fold function call.\n");
           return false;
         }
       } else {
-        if (Callee->getFunctionType()->isVarArg())
+        if (Callee->getFunctionType()->isVarArg()) {
+          DEBUG(dbgs() << "Can not constant fold vararg function call.\n");
           return false;
+        }

-        Constant *RetVal;
+        Constant *RetVal = 0;
+
         // Execute the call, if successful, use the return value.
         ValueStack.push_back(new DenseMap<Value*, Constant*>);
-        if (!EvaluateFunction(Callee, RetVal, Formals))
+        if (!EvaluateFunction(Callee, RetVal, Formals)) {
+          DEBUG(dbgs() << "Failed to evaluate function.\n");
           return false;
+        }
         delete ValueStack.pop_back_val();
         InstResult = RetVal;
+
+        if (InstResult != NULL) {
+          DEBUG(dbgs() << "Successfully evaluated function. Result: " <<
+                InstResult << "\n\n");
+        } else {
+          DEBUG(dbgs() << "Successfully evaluated function. Result: 0\n\n");
+        }
       }
     } else if (isa<TerminatorInst>(CurInst)) {
+      DEBUG(dbgs() << "Found a terminator instruction.\n");
+
       if (BranchInst *BI = dyn_cast<BranchInst>(CurInst)) {
         if (BI->isUnconditional()) {
           NextBB = BI->getSuccessor(0);
@@ -2787,13 +2876,17 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
         NextBB = 0;
       } else {
         // invoke, unwind, resume, unreachable.
+        DEBUG(dbgs() << "Can not handle terminator.");
         return false;  // Cannot handle this terminator.
       }

       // We succeeded at evaluating this block!
+      DEBUG(dbgs() << "Successfully evaluated block.\n");
       return true;
     } else {
       // Did not know how to evaluate this!
+      DEBUG(dbgs() << "Failed to evaluate block due to unhandled instruction."
+            "\n");
       return false;
     }

@@ -2807,6 +2900,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
     // If we just processed an invoke, we finished evaluating the block.
     if (InvokeInst *II = dyn_cast<InvokeInst>(CurInst)) {
       NextBB = II->getNormalDest();
+      DEBUG(dbgs() << "Found an invoke instruction. Finished Block.\n\n");
       return true;
     }

@@ -2845,6 +2939,8 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal,
   while (1) {
     BasicBlock *NextBB = 0; // Initialized to avoid compiler warnings.

+    DEBUG(dbgs() << "Trying to evaluate BB: " << *CurBB << "\n");
+
     if (!EvaluateBlock(CurInst, NextBB))
       return false;

@@ -2924,6 +3020,7 @@ bool GlobalOpt::OptimizeGlobalCtorsList(GlobalVariable *&GCL) {
       }
       break;
     }
+    DEBUG(dbgs() << "Optimizing Global Constructor: " << *F << "\n");

     // We cannot simplify external ctor functions.
     if (F->empty()) continue;
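Most of the GlobalOpt churn above is DEBUG(dbgs() << ...) tracing threaded through the static-constructor evaluator. For reference, a sketch of the idiom; the DEBUG_TYPE value is assumed to be the one GlobalOpt.cpp already defines:

  #define DEBUG_TYPE "globalopt"   // assumed; defined before the DEBUG uses
  #include "llvm/Support/Debug.h"
  #include "llvm/Support/raw_ostream.h"

  // DEBUG(...) compiles away entirely when NDEBUG is defined; in +Asserts
  // builds the body runs only under -debug (or -debug-only=globalopt).
  static void traceInst(const llvm::Instruction &I) {
    DEBUG(llvm::dbgs() << "Evaluating Instruction: " << I << "\n");
  }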
diff --git a/lib/Transforms/IPO/InlineAlways.cpp b/lib/Transforms/IPO/InlineAlways.cpp
index 2971803..a0095da 100644
--- a/lib/Transforms/IPO/InlineAlways.cpp
+++ b/lib/Transforms/IPO/InlineAlways.cpp
@@ -30,35 +30,41 @@ using namespace llvm;

 namespace {

-  // AlwaysInliner only inlines functions that are mark as "always inline".
-  class AlwaysInliner : public Inliner {
-    InlineCostAnalyzer CA;
-  public:
-    // Use extremely low threshold.
-    AlwaysInliner() : Inliner(ID, -2000000000, /*InsertLifetime*/true) {
-      initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry());
-    }
-    AlwaysInliner(bool InsertLifetime) : Inliner(ID, -2000000000,
-                                                 InsertLifetime) {
-      initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry());
-    }
-    static char ID; // Pass identification, replacement for typeid
-    virtual InlineCost getInlineCost(CallSite CS);
-
-    using llvm::Pass::doInitialization;
-    using llvm::Pass::doFinalization;
-
-    virtual bool doFinalization(CallGraph &CG) {
-      return removeDeadFunctions(CG, /*AlwaysInlineOnly=*/true);
-    }
-    virtual bool doInitialization(CallGraph &CG);
-  };
+/// \brief Inliner pass which only handles "always inline" functions.
+class AlwaysInliner : public Inliner {
+  InlineCostAnalysis *ICA;
+
+public:
+  // Use extremely low threshold.
+  AlwaysInliner() : Inliner(ID, -2000000000, /*InsertLifetime*/ true), ICA(0) {
+    initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry());
+  }
+
+  AlwaysInliner(bool InsertLifetime)
+      : Inliner(ID, -2000000000, InsertLifetime), ICA(0) {
+    initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry());
+  }
+
+  static char ID; // Pass identification, replacement for typeid
+
+  virtual InlineCost getInlineCost(CallSite CS);
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+  virtual bool runOnSCC(CallGraphSCC &SCC);
+
+  using llvm::Pass::doFinalization;
+  virtual bool doFinalization(CallGraph &CG) {
+    return removeDeadFunctions(CG, /*AlwaysInlineOnly=*/ true);
+  }
+};
+
 }

 char AlwaysInliner::ID = 0;
 INITIALIZE_PASS_BEGIN(AlwaysInliner, "always-inline",
                 "Inliner for always_inline functions", false, false)
 INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis)
 INITIALIZE_PASS_END(AlwaysInliner, "always-inline",
                 "Inliner for always_inline functions", false, false)

@@ -89,15 +95,18 @@ InlineCost AlwaysInliner::getInlineCost(CallSite CS) {
   if (Callee && !Callee->isDeclaration() &&
       Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
                                            Attribute::AlwaysInline) &&
-      CA.isInlineViable(*Callee))
+      ICA->isInlineViable(*Callee))
     return InlineCost::getAlways();

   return InlineCost::getNever();
 }

-// doInitialization - Initializes the vector of functions that have not
-// been annotated with the "always inline" attribute.
-bool AlwaysInliner::doInitialization(CallGraph &CG) {
-  CA.setDataLayout(getAnalysisIfAvailable<DataLayout>());
-  return false;
+bool AlwaysInliner::runOnSCC(CallGraphSCC &SCC) {
+  ICA = &getAnalysis<InlineCostAnalysis>();
+  return Inliner::runOnSCC(SCC);
+}
+
+void AlwaysInliner::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<InlineCostAnalysis>();
+  Inliner::getAnalysisUsage(AU);
 }
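Both inliner rewrites in this patch share the same shape: the pass declares a dependency on the new InlineCostAnalysis pass and fetches it per SCC, instead of caching a DataLayout at initialization. A sketch of the pattern with a hypothetical pass name MyInliner; the calls are the ones used in the hunks above:

  void MyInliner::getAnalysisUsage(AnalysisUsage &AU) const {
    AU.addRequired<InlineCostAnalysis>(); // scheduled before this pass runs
    Inliner::getAnalysisUsage(AU);        // keep the base class requirements
  }

  bool MyInliner::runOnSCC(CallGraphSCC &SCC) {
    ICA = &getAnalysis<InlineCostAnalysis>(); // valid while this SCC runs
    return Inliner::runOnSCC(SCC);
  }

The matching INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis) line registers the dependency with the pass registry.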
diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp
index 9682923..a4f7026 100644
--- a/lib/Transforms/IPO/InlineSimple.cpp
+++ b/lib/Transforms/IPO/InlineSimple.cpp
@@ -28,29 +28,41 @@ using namespace llvm;

 namespace {

-  class SimpleInliner : public Inliner {
-    InlineCostAnalyzer CA;
-  public:
-    SimpleInliner() : Inliner(ID) {
-      initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
-    }
-    SimpleInliner(int Threshold) : Inliner(ID, Threshold,
-                                           /*InsertLifetime*/true) {
-      initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
-    }
-    static char ID; // Pass identification, replacement for typeid
-    InlineCost getInlineCost(CallSite CS) {
-      return CA.getInlineCost(CS, getInlineThreshold(CS));
-    }
-    using llvm::Pass::doInitialization;
-    virtual bool doInitialization(CallGraph &CG);
-  };
-}
+/// \brief Actual inliner pass implementation.
+///
+/// The common implementation of the inlining logic is shared between this
+/// inliner pass and the always inliner pass. The two passes use different cost
+/// analyses to determine when to inline.
+class SimpleInliner : public Inliner {
+  InlineCostAnalysis *ICA;
+
+public:
+  SimpleInliner() : Inliner(ID), ICA(0) {
+    initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
+  }
+
+  SimpleInliner(int Threshold)
+      : Inliner(ID, Threshold, /*InsertLifetime*/ true), ICA(0) {
+    initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
+  }
+
+  static char ID; // Pass identification, replacement for typeid
+
+  InlineCost getInlineCost(CallSite CS) {
+    return ICA->getInlineCost(CS, getInlineThreshold(CS));
+  }
+
+  virtual bool runOnSCC(CallGraphSCC &SCC);
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+};
+
+} // end anonymous namespace

 char SimpleInliner::ID = 0;
 INITIALIZE_PASS_BEGIN(SimpleInliner, "inline",
                 "Function Integration/Inlining", false, false)
 INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis)
 INITIALIZE_PASS_END(SimpleInliner, "inline",
                 "Function Integration/Inlining", false, false)

@@ -60,10 +72,12 @@ Pass *llvm::createFunctionInliningPass(int Threshold) {
   return new SimpleInliner(Threshold);
 }

-// doInitialization - Initializes the vector of functions that have been
-// annotated with the noinline attribute.
-bool SimpleInliner::doInitialization(CallGraph &CG) {
-  CA.setDataLayout(getAnalysisIfAvailable<DataLayout>());
-  return false;
+bool SimpleInliner::runOnSCC(CallGraphSCC &SCC) {
+  ICA = &getAnalysis<InlineCostAnalysis>();
+  return Inliner::runOnSCC(SCC);
 }

+void SimpleInliner::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<InlineCostAnalysis>();
+  Inliner::getAnalysisUsage(AU);
+}
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 2187a2a..663ddb7 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -72,6 +72,40 @@ void Inliner::getAnalysisUsage(AnalysisUsage &AU) const {

 typedef DenseMap<ArrayType*, std::vector<AllocaInst*> >
 InlinedArrayAllocasTy;

+/// \brief If the inlined function had a higher stack protection level than the
+/// calling function, then bump up the caller's stack protection level.
+static void AdjustCallerSSPLevel(Function *Caller, Function *Callee) {
+  // If upgrading the SSP attribute, clear out the old SSP Attributes first.
+  // Having multiple SSP attributes doesn't actually hurt, but it adds useless
+  // clutter to the IR.
+  AttrBuilder B;
+  B.addAttribute(Attribute::StackProtect)
+    .addAttribute(Attribute::StackProtectStrong);
+  AttributeSet OldSSPAttr = AttributeSet::get(Caller->getContext(),
+                                              AttributeSet::FunctionIndex,
+                                              B);
+  AttributeSet CallerAttr = Caller->getAttributes(),
+               CalleeAttr = Callee->getAttributes();
+
+  if (CalleeAttr.hasAttribute(AttributeSet::FunctionIndex,
+                              Attribute::StackProtectReq)) {
+    Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr);
+    Caller->addFnAttr(Attribute::StackProtectReq);
+  } else if (CalleeAttr.hasAttribute(AttributeSet::FunctionIndex,
+                                     Attribute::StackProtectStrong) &&
+             !CallerAttr.hasAttribute(AttributeSet::FunctionIndex,
+                                      Attribute::StackProtectReq)) {
+    Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr);
+    Caller->addFnAttr(Attribute::StackProtectStrong);
+  } else if (CalleeAttr.hasAttribute(AttributeSet::FunctionIndex,
+                                     Attribute::StackProtect) &&
+             !CallerAttr.hasAttribute(AttributeSet::FunctionIndex,
+                                      Attribute::StackProtectReq) &&
+             !CallerAttr.hasAttribute(AttributeSet::FunctionIndex,
+                                      Attribute::StackProtectStrong))
+    Caller->addFnAttr(Attribute::StackProtect);
+}
+
 /// InlineCallIfPossible - If it is possible to inline the specified call site,
 /// do so and update the CallGraph for this operation.
 ///
@@ -91,16 +125,7 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI,
   if (!InlineFunction(CS, IFI, InsertLifetime))
     return false;

-  // If the inlined function had a higher stack protection level than the
-  // calling function, then bump up the caller's stack protection level.
-  if (Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                           Attribute::StackProtectReq))
-    Caller->addFnAttr(Attribute::StackProtectReq);
-  else if (Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                                Attribute::StackProtect) &&
-           !Caller->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                                 Attribute::StackProtectReq))
-    Caller->addFnAttr(Attribute::StackProtect);
+  AdjustCallerSSPLevel(Caller, Callee);

   // Look at all of the allocas that we inlined through this call site.  If we
   // have already inlined other allocas through other calls into this function,
diff --git a/lib/Transforms/IPO/Internalize.cpp b/lib/Transforms/IPO/Internalize.cpp
index 70d55b0..4bfab5b 100644
--- a/lib/Transforms/IPO/Internalize.cpp
+++ b/lib/Transforms/IPO/Internalize.cpp
@@ -50,6 +50,8 @@ namespace {
     explicit InternalizePass();
     explicit InternalizePass(ArrayRef<const char *> exportList);
     void LoadFile(const char *Filename);
+    void ClearExportList();
+    void AddToExportList(const std::string &val);
     virtual bool runOnModule(Module &M);

     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
@@ -97,6 +99,14 @@ void InternalizePass::LoadFile(const char *Filename) {
   }
 }

+void InternalizePass::ClearExportList() {
+  ExternalNames.clear();
+}
+
+void InternalizePass::AddToExportList(const std::string &val) {
+  ExternalNames.insert(val);
+}
+
 bool InternalizePass::runOnModule(Module &M) {
   CallGraph *CG = getAnalysisIfAvailable<CallGraph>();
   CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : 0;
diff --git a/lib/Transforms/IPO/LLVMBuild.txt b/lib/Transforms/IPO/LLVMBuild.txt
index b18c915..124cbb6 100644
--- a/lib/Transforms/IPO/LLVMBuild.txt
+++ b/lib/Transforms/IPO/LLVMBuild.txt
@@ -20,4 +20,4 @@ type = Library
 name = IPO
 parent = Transforms
 library_name = ipo
-required_libraries = Analysis Core IPA InstCombine Scalar Vectorize Support Target TransformUtils
+required_libraries = Analysis Core IPA InstCombine Scalar Vectorize Support Target TransformUtils ObjCARC
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 6dc1773..47b2b51 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -214,6 +214,10 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
         MPM.add(createGVNPass());           // Remove redundancies
       else
         MPM.add(createEarlyCSEPass());      // Catch trivial redundancies
+
+      // BBVectorize may have significantly shortened a loop body; unroll again.
+      if (!DisableUnrollLoops)
+        MPM.add(createLoopUnrollPass());
     }

   MPM.add(createAggressiveDCEPass());       // Delete dead instructions
diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp
index d872f0c..73d9323 100644
--- a/lib/Transforms/IPO/PruneEH.cpp
+++ b/lib/Transforms/IPO/PruneEH.cpp
@@ -146,9 +146,11 @@ bool PruneEH::runOnSCC(CallGraphSCC &SCC) {

       Function *F = (*I)->getFunction();
       const AttributeSet &PAL = F->getAttributes();
-      const AttributeSet &NPAL = PAL.addAttr(F->getContext(), ~0,
-                                             Attribute::get(F->getContext(),
-                                                            NewAttributes));
+      const AttributeSet &NPAL =
+        PAL.addAttributes(F->getContext(), AttributeSet::FunctionIndex,
+                          AttributeSet::get(F->getContext(),
+                                            AttributeSet::FunctionIndex,
+                                            NewAttributes));
       if (PAL != NPAL) {
         MadeChange = true;
         F->setAttributes(NPAL);
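AdjustCallerSSPLevel in the Inliner.cpp hunk implements a strength ordering among the stack-protector attributes: StackProtectReq > StackProtectStrong > StackProtect, with the caller ending up at the maximum of the two functions' levels. A sketch that makes the lattice explicit; sspLevel and its numeric ranks are illustrative only:

  // Rank the mutually exclusive SSP function attributes; inlining must
  // never lower the caller, so it keeps max(callerLevel, calleeLevel).
  static int sspLevel(const Function &F) {
    const AttributeSet &A = F.getAttributes();
    if (A.hasAttribute(AttributeSet::FunctionIndex,
                       Attribute::StackProtectReq))
      return 3;
    if (A.hasAttribute(AttributeSet::FunctionIndex,
                       Attribute::StackProtectStrong))
      return 2;
    if (A.hasAttribute(AttributeSet::FunctionIndex, Attribute::StackProtect))
      return 1;
    return 0;
  }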
diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h
index 959daa2..1f6a3a5e 100644
--- a/lib/Transforms/InstCombine/InstCombine.h
+++ b/lib/Transforms/InstCombine/InstCombine.h
@@ -27,7 +27,7 @@ namespace llvm {
 class DbgDeclareInst;
 class MemIntrinsic;
 class MemSetInst;
-  
+
 /// SelectPatternFlavor - We can match a variety of different patterns for
 /// select operations.
 enum SelectPatternFlavor {
@@ -36,7 +36,7 @@ enum SelectPatternFlavor {
   SPF_SMAX, SPF_UMAX
   //SPF_ABS - TODO.
 };
-  
+
 /// getComplexity:  Assign a complexity or rank value to LLVM Values...
 /// 0 -> undef, 1 -> Const, 2 -> Other, 3 -> Arg, 3 -> Unary, 4 -> OtherInst
 static inline unsigned getComplexity(Value *V) {
@@ -51,23 +51,23 @@ static inline unsigned getComplexity(Value *V) {
   return isa<Constant>(V) ? (isa<UndefValue>(V) ? 0 : 1) : 2;
 }
-  
+
 /// InstCombineIRInserter - This is an IRBuilder insertion helper that works
 /// just like the normal insertion helper, but also adds any new instructions
 /// to the instcombine worklist.
-class LLVM_LIBRARY_VISIBILITY InstCombineIRInserter 
+class LLVM_LIBRARY_VISIBILITY InstCombineIRInserter
     : public IRBuilderDefaultInserter<true> {
   InstCombineWorklist &Worklist;
 public:
   InstCombineIRInserter(InstCombineWorklist &WL) : Worklist(WL) {}
-  
+
   void InsertHelper(Instruction *I, const Twine &Name,
                     BasicBlock *BB, BasicBlock::iterator InsertPt) const {
     IRBuilderDefaultInserter<true>::InsertHelper(I, Name, BB, InsertPt);
     Worklist.Add(I);
   }
 };
-  
+
 /// InstCombiner - The -instcombine pass.
 class LLVM_LIBRARY_VISIBILITY InstCombiner
                              : public FunctionPass,
@@ -85,7 +85,7 @@ public:
   /// instructions into the worklist when they are created.
   typedef IRBuilder<true, TargetFolder, InstCombineIRInserter> BuilderTy;
   BuilderTy *Builder;
-  
+
   static char ID; // Pass identification, replacement for typeid
   InstCombiner() : FunctionPass(ID), TD(0), Builder(0) {
     MinimizeSize = false;
@@ -94,7 +94,7 @@ public:

 public:
   virtual bool runOnFunction(Function &F);
-  
+
   bool DoOneIteration(Function &F, unsigned ItNum);

   virtual void getAnalysisUsage(AnalysisUsage &AU) const;
@@ -211,11 +211,11 @@ public:
 private:
   bool ShouldChangeType(Type *From, Type *To) const;
   Value *dyn_castNegVal(Value *V) const;
-  Value *dyn_castFNegVal(Value *V) const;
-  Type *FindElementAtOffset(Type *Ty, int64_t Offset, 
+  Value *dyn_castFNegVal(Value *V, bool NoSignedZero=false) const;
+  Type *FindElementAtOffset(Type *Ty, int64_t Offset,
                             SmallVectorImpl<Value*> &NewIndices);
   Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI);
-  
+
   /// ShouldOptimizeCast - Return true if the cast from "V to Ty" actually
   /// results in any code being generated and is interesting to optimize out. If
   /// the cast can be eliminated by some other simple transformation, we prefer
@@ -247,7 +247,7 @@ public:
     return New;
   }

-  // InsertNewInstWith - same as InsertNewInstBefore, but also sets the 
+  // InsertNewInstWith - same as InsertNewInstBefore, but also sets the
   // debug loc.
   //
   Instruction *InsertNewInstWith(Instruction *New, Instruction &Old) {
@@ -263,10 +263,10 @@ public:
   //
   Instruction *ReplaceInstUsesWith(Instruction &I, Value *V) {
     Worklist.AddUsersToWorkList(I);   // Add all modified instrs to worklist.
-    
+
     // If we are replacing the instruction with itself, this must be in a
     // segment of unreachable code, so just clobber the instruction.
-    if (&I == V) 
+    if (&I == V)
       V = UndefValue::get(I.getType());

     DEBUG(errs() << "IC: Replacing " << I << "\n"
@@ -296,13 +296,13 @@ public:
     MadeIRChange = true;
     return 0;  // Don't do anything with FI
   }
-      
+
   void ComputeMaskedBits(Value *V, APInt &KnownZero,
                          APInt &KnownOne, unsigned Depth = 0) const {
     return llvm::ComputeMaskedBits(V, KnownZero, KnownOne, TD, Depth);
   }
-  
-  bool MaskedValueIsZero(Value *V, const APInt &Mask, 
+
+  bool MaskedValueIsZero(Value *V, const APInt &Mask,
                          unsigned Depth = 0) const {
     return llvm::MaskedValueIsZero(V, Mask, TD, Depth);
   }
@@ -325,10 +325,10 @@ private:

   /// SimplifyDemandedUseBits - Attempts to replace V with a simpler value
   /// based on the demanded bits.
-  Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, 
+  Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
                                  APInt& KnownZero, APInt& KnownOne,
                                  unsigned Depth);
-  bool SimplifyDemandedBits(Use &U, APInt DemandedMask, 
+  bool SimplifyDemandedBits(Use &U, APInt DemandedMask,
                             APInt& KnownZero, APInt& KnownOne,
                             unsigned Depth=0);

   /// Helper routine of SimplifyDemandedUseBits. It tries to simplify demanded
@@ -336,15 +336,15 @@ private:
   Value *SimplifyShrShlDemandedBits(Instruction *Lsr, Instruction *Sftl,
                                     APInt DemandedMask, APInt &KnownZero,
                                     APInt &KnownOne);
-  
+
   /// SimplifyDemandedInstructionBits - Inst is an integer instruction that
   /// SimplifyDemandedBits knows about. See if the instruction has any
   /// properties that allow us to simplify its operands.
   bool SimplifyDemandedInstructionBits(Instruction &Inst);
-  
+
   Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
                                     APInt& UndefElts, unsigned Depth = 0);
-    
+
   // FoldOpIntoPhi - Given a binary operator, cast instruction, or select
   // which has a PHI node as operand #0, see if we can fold the instruction
   // into the PHI (which is only possible if all operands to the PHI are
@@ -360,10 +360,10 @@ private:

   Instruction *FoldPHIArgGEPIntoPHI(PHINode &PN);
   Instruction *FoldPHIArgLoadIntoPHI(PHINode &PN);
-  
+
   Instruction *OptAndOp(Instruction *Op, ConstantInt *OpRHS,
                         ConstantInt *AndRHS, BinaryOperator &TheAnd);
-  
+
   Value *FoldLogicalPlusAnd(Value *LHS, Value *RHS, ConstantInt *Mask,
                             bool isSub, Instruction &I);
   Value *InsertRangeTest(Value *V, Constant *Lo, Constant *Hi,
@@ -382,8 +382,8 @@ private:
   Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap);

 };
-      
-  
+
+
 } // end namespace llvm.

 #endif
diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index f07c58d..c6d60d6 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -66,10 +66,12 @@ namespace {
     bool insaneIntVal(int V) { return V > 4 || V < -4; }
     APFloat *getFpValPtr(void)
       { return reinterpret_cast<APFloat*>(&FpValBuf.buffer[0]); }
+    const APFloat *getFpValPtr(void) const
+      { return reinterpret_cast<const APFloat*>(&FpValBuf.buffer[0]); }

     const APFloat &getFpVal(void) const {
       assert(IsFp && BufHasFpVal && "Incorret state");
-      return *reinterpret_cast<const APFloat*>(&FpValBuf.buffer[0]);
+      return *getFpValPtr();
     }

     APFloat &getFpVal(void)
@@ -1248,6 +1250,16 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {

     if (SimplifyDemandedInstructionBits(I))
       return &I;
+
+    // Fold (sub 0, (zext bool to B)) --> (sext bool to B)
+    if (C->isZero() && match(Op1, m_ZExt(m_Value(X))))
+      if (X->getType()->isIntegerTy(1))
+        return CastInst::CreateSExtOrBitCast(X, Op1->getType());
+
+    // Fold (sub 0, (sext bool to B)) --> (zext bool to B)
+    if (C->isZero() && match(Op1, m_SExt(m_Value(X))))
+      if (X->getType()->isIntegerTy(1))
+        return CastInst::CreateZExtOrBitCast(X, Op1->getType());
   }
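The visitSub hunk rests on a small identity over booleans, worked out here as a comment-only illustration (not part of the patch):

  // For a boolean X (i1) widened to an integer type B:
  //   zext X -> 0 or 1,  so  sub 0, (zext X) -> 0 or -1, which is sext X;
  //   sext X -> 0 or -1, so  sub 0, (sext X) -> 0 or 1,  which is zext X.
  // e.g.  %n = sub i32 0, (zext i1 %b to i32)  ==>  %n = sext i1 %b to i32

The fold is expressed with the PatternMatch helpers (match, m_ZExt, m_SExt, m_Value) that InstCombine already uses throughout.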
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index c1e60d4..4332467 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -1245,6 +1245,34 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
     }
   }

+  {
+    Value *X = 0;
+    bool OpsSwapped = false;
+    // Canonicalize SExt or Not to the LHS
+    if (match(Op1, m_SExt(m_Value())) ||
+        match(Op1, m_Not(m_Value()))) {
+      std::swap(Op0, Op1);
+      OpsSwapped = true;
+    }
+
+    // Fold (and (sext bool to A), B) --> (select bool, B, 0)
+    if (match(Op0, m_SExt(m_Value(X))) &&
+        X->getType()->getScalarType()->isIntegerTy(1)) {
+      Value *Zero = Constant::getNullValue(Op1->getType());
+      return SelectInst::Create(X, Op1, Zero);
+    }
+
+    // Fold (and ~(sext bool to A), B) --> (select bool, 0, B)
+    if (match(Op0, m_Not(m_SExt(m_Value(X)))) &&
+        X->getType()->getScalarType()->isIntegerTy(1)) {
+      Value *Zero = Constant::getNullValue(Op0->getType());
+      return SelectInst::Create(X, Zero, Op1);
+    }
+
+    if (OpsSwapped)
+      std::swap(Op0, Op1);
+  }
+
   return Changed ? &I : 0;
 }

@@ -2043,6 +2071,20 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
       return BinaryOperator::CreateOr(Inner, C1);
   }

+  // Change (or (bool?A:B),(bool?C:D)) --> (bool?(or A,C):(or B,D))
+  // Since this OR statement hasn't been optimized further yet, we hope
+  // that this transformation will allow the new ORs to be optimized.
+  {
+    Value *X = 0, *Y = 0;
+    if (Op0->hasOneUse() && Op1->hasOneUse() &&
+        match(Op0, m_Select(m_Value(X), m_Value(A), m_Value(B))) &&
+        match(Op1, m_Select(m_Value(Y), m_Value(C), m_Value(D))) && X == Y) {
+      Value *orTrue = Builder->CreateOr(A, C);
+      Value *orFalse = Builder->CreateOr(B, D);
+      return SelectInst::Create(X, orTrue, orFalse);
+    }
+  }
+
   return Changed ? &I : 0;
 }
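The new visitAnd block uses the same observation from the other side: sext of an i1 is an all-zeros or all-ones mask, so masking B with it is really a select. As a comment-only worked example in the spirit of the hunk above (not from the patch):

  // sext X is 0 when X is false and ~0 when X is true, therefore:
  //   and (sext X), B   ==  X ? B : 0  ==  select X, B, 0
  //   and ~(sext X), B  ==  X ? 0 : B  ==  select X, 0, B
  // The getScalarType()->isIntegerTy(1) guard lets the fold apply to
  // vectors of i1 as well as scalar i1.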
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index d17879b..64cd1bd 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1014,8 +1014,11 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
       return false;   // Cannot transform this return value.

     if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
-      AttrBuilder RAttrs = CallerPAL.getRetAttributes();
-      if (RAttrs.hasAttributes(Attribute::typeIncompatible(NewRetTy)))
+      AttrBuilder RAttrs(CallerPAL, AttributeSet::ReturnIndex);
+      if (RAttrs.
+          hasAttributes(AttributeFuncs::
+                        typeIncompatible(NewRetTy, AttributeSet::ReturnIndex),
+                        AttributeSet::ReturnIndex))
         return false;   // Attribute not compatible with transformed value.
     }
@@ -1044,14 +1047,16 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
     if (!CastInst::isCastable(ActTy, ParamTy))
       return false;   // Cannot transform this parameter value.

-    Attribute Attrs = CallerPAL.getParamAttributes(i + 1);
-    if (AttrBuilder(Attrs).
-          hasAttributes(Attribute::typeIncompatible(ParamTy)))
+    if (AttrBuilder(CallerPAL.getParamAttributes(i + 1), i + 1).
+          hasAttributes(AttributeFuncs::
+                        typeIncompatible(ParamTy, i + 1), i + 1))
       return false;   // Attribute not compatible with transformed value.

     // If the parameter is passed as a byval argument, then we have to have a
     // sized type and the sized type has to have the same size as the old type.
-    if (ParamTy != ActTy && Attrs.hasAttribute(Attribute::ByVal)) {
+    if (ParamTy != ActTy &&
+        CallerPAL.getParamAttributes(i + 1).hasAttribute(i + 1,
+                                                         Attribute::ByVal)) {
       PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
       if (ParamPTy == 0 || !ParamPTy->getElementType()->isSized() || TD == 0)
         return false;
@@ -1100,11 +1105,13 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
   // won't be dropping them.  Check that these extra arguments have attributes
   // that are compatible with being a vararg call argument.
   for (unsigned i = CallerPAL.getNumSlots(); i; --i) {
-    if (CallerPAL.getSlot(i - 1).Index <= FT->getNumParams())
+    unsigned Index = CallerPAL.getSlotIndex(i - 1);
+    if (Index <= FT->getNumParams())
       break;
-    Attribute PAttrs = CallerPAL.getSlot(i - 1).Attrs;
+
     // Check if it has an attribute that's incompatible with varargs.
-    if (PAttrs.hasAttribute(Attribute::StructRet))
+    AttributeSet PAttrs = CallerPAL.getSlotAttributes(i - 1);
+    if (PAttrs.hasAttribute(Index, Attribute::StructRet))
       return false;
   }

@@ -1113,21 +1120,23 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
   // inserting cast instructions as necessary.
   std::vector<Value*> Args;
   Args.reserve(NumActualArgs);
-  SmallVector<AttributeWithIndex, 8> attrVec;
+  SmallVector<AttributeSet, 8> attrVec;
   attrVec.reserve(NumCommonArgs);

   // Get any return attributes.
-  AttrBuilder RAttrs = CallerPAL.getRetAttributes();
+  AttrBuilder RAttrs(CallerPAL, AttributeSet::ReturnIndex);

   // If the return value is not being used, the type may not be compatible
   // with the existing attributes.  Wipe out any problematic attributes.
-  RAttrs.removeAttributes(Attribute::typeIncompatible(NewRetTy));
+  RAttrs.
+    removeAttributes(AttributeFuncs::
+                     typeIncompatible(NewRetTy, AttributeSet::ReturnIndex),
+                     AttributeSet::ReturnIndex);

   // Add the new return attributes.
   if (RAttrs.hasAttributes())
-    attrVec.push_back(
-      AttributeWithIndex::get(AttributeSet::ReturnIndex,
-                              Attribute::get(FT->getContext(), RAttrs)));
+    attrVec.push_back(AttributeSet::get(Caller->getContext(),
+                                        AttributeSet::ReturnIndex, RAttrs));

   AI = CS.arg_begin();
   for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) {
@@ -1141,9 +1150,10 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
     }

     // Add any parameter attributes.
-    Attribute PAttrs = CallerPAL.getParamAttributes(i + 1);
+    AttrBuilder PAttrs(CallerPAL.getParamAttributes(i + 1), i + 1);
     if (PAttrs.hasAttributes())
-      attrVec.push_back(AttributeWithIndex::get(i + 1, PAttrs));
+      attrVec.push_back(AttributeSet::get(Caller->getContext(), i + 1,
+                                          PAttrs));
   }

   // If the function takes more arguments than the call was taking, add them
@@ -1168,23 +1178,23 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
       }

       // Add any parameter attributes.
-      Attribute PAttrs = CallerPAL.getParamAttributes(i + 1);
+      AttrBuilder PAttrs(CallerPAL.getParamAttributes(i + 1), i + 1);
       if (PAttrs.hasAttributes())
-        attrVec.push_back(AttributeWithIndex::get(i + 1, PAttrs));
+        attrVec.push_back(AttributeSet::get(FT->getContext(), i + 1,
+                                            PAttrs));
     }
   }

-  Attribute FnAttrs = CallerPAL.getFnAttributes();
-  if (FnAttrs.hasAttributes())
-    attrVec.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex,
-                                              FnAttrs));
+  AttributeSet FnAttrs = CallerPAL.getFnAttributes();
+  if (CallerPAL.hasAttributes(AttributeSet::FunctionIndex))
+    attrVec.push_back(AttributeSet::get(Callee->getContext(), FnAttrs));

   if (NewRetTy->isVoidTy())
     Caller->setName("");   // Void type should not have a name.

   const AttributeSet &NewCallerPAL = AttributeSet::get(Callee->getContext(),
-                                                        attrVec);
+                                                       attrVec);

   Instruction *NC;
   if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
@@ -1262,12 +1272,12 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
     if (!NestAttrs.isEmpty()) {
       unsigned NestIdx = 1;
       Type *NestTy = 0;
-      Attribute NestAttr;
+      AttributeSet NestAttr;

       // Look for a parameter marked with the 'nest' attribute.
       for (FunctionType::param_iterator I = NestFTy->param_begin(),
            E = NestFTy->param_end(); I != E; ++NestIdx, ++I)
-        if (NestAttrs.getParamAttributes(NestIdx).hasAttribute(Attribute::Nest)){
+        if (NestAttrs.hasAttribute(NestIdx, Attribute::Nest)) {
           // Record the parameter type and any other attributes.
           NestTy = *I;
           NestAttr = NestAttrs.getParamAttributes(NestIdx);
@@ -1279,17 +1289,16 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
       std::vector<Value*> NewArgs;
       NewArgs.reserve(unsigned(CS.arg_end()-CS.arg_begin())+1);

-      SmallVector<AttributeWithIndex, 8> NewAttrs;
+      SmallVector<AttributeSet, 8> NewAttrs;
       NewAttrs.reserve(Attrs.getNumSlots() + 1);

       // Insert the nest argument into the call argument list, which may
       // mean appending it.  Likewise for attributes.

       // Add any result attributes.
-      Attribute Attr = Attrs.getRetAttributes();
-      if (Attr.hasAttributes())
-        NewAttrs.push_back(AttributeWithIndex::get(AttributeSet::ReturnIndex,
-                                                   Attr));
+      if (Attrs.hasAttributes(AttributeSet::ReturnIndex))
+        NewAttrs.push_back(AttributeSet::get(Caller->getContext(),
+                                             Attrs.getRetAttributes()));

       {
         unsigned Idx = 1;
@@ -1301,7 +1310,8 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
             if (NestVal->getType() != NestTy)
               NestVal = Builder->CreateBitCast(NestVal, NestTy, "nest");
             NewArgs.push_back(NestVal);
-            NewAttrs.push_back(AttributeWithIndex::get(NestIdx, NestAttr));
+            NewAttrs.push_back(AttributeSet::get(Caller->getContext(),
+                                                 NestAttr));
           }

           if (I == E)
@@ -1309,20 +1319,21 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,

           // Add the original argument and attributes.
           NewArgs.push_back(*I);
-          Attr = Attrs.getParamAttributes(Idx);
-          if (Attr.hasAttributes())
-            NewAttrs.push_back
-              (AttributeWithIndex::get(Idx + (Idx >= NestIdx), Attr));
+          AttributeSet Attr = Attrs.getParamAttributes(Idx);
+          if (Attr.hasAttributes(Idx)) {
+            AttrBuilder B(Attr, Idx);
+            NewAttrs.push_back(AttributeSet::get(Caller->getContext(),
+                                                 Idx + (Idx >= NestIdx), B));
+          }

           ++Idx, ++I;
         } while (1);
       }

       // Add any function attributes.
-      Attr = Attrs.getFnAttributes();
-      if (Attr.hasAttributes())
-        NewAttrs.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex,
-                                                   Attr));
+      if (Attrs.hasAttributes(AttributeSet::FunctionIndex))
+        NewAttrs.push_back(AttributeSet::get(FTy->getContext(),
+                                             Attrs.getFnAttributes()));

       // The trampoline may have been bitcast to a bogus type (FTy).
       // Handle this by synthesizing a new function type, equal to FTy
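transformConstExprCastCall now iterates attribute slots through the getSlotIndex/getSlotAttributes accessor pair that replaces the removed getSlot struct. A sketch of the iteration idiom built only from calls in the patch; anySlotHas is a hypothetical helper:

  // Walk every slot of PAL; a slot pairs an index (a 1-based parameter
  // position, or ReturnIndex/FunctionIndex) with that index's attributes.
  static bool anySlotHas(const AttributeSet &PAL, Attribute::AttrKind Kind) {
    for (unsigned i = 0, e = PAL.getNumSlots(); i != e; ++i) {
      unsigned Index = PAL.getSlotIndex(i);
      if (PAL.getSlotAttributes(i).hasAttribute(Index, Kind))
        return true;
    }
    return false;
  }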
unsigned SubScale; - Value *SubVal = + Value *SubVal = DecomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset); Offset += RHS->getZExtValue(); Scale = SubScale; @@ -82,7 +82,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, if (!TD) return 0; PointerType *PTy = cast<PointerType>(CI.getType()); - + BuilderTy AllocaBuilder(*Builder); AllocaBuilder.SetInsertPoint(AI.getParent(), &AI); @@ -110,7 +110,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, uint64_t ArrayOffset; Value *NumElements = // See if the array size is a decomposable linear expr. DecomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset); - + // If we can now satisfy the modulus, by using a non-1 scale, we really can // do the xform. if ((AllocElTySize*ArraySizeScale) % CastElTySize != 0 || @@ -125,17 +125,17 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, // Insert before the alloca, not before the cast. Amt = AllocaBuilder.CreateMul(Amt, NumElements); } - + if (uint64_t Offset = (AllocElTySize*ArrayOffset)/CastElTySize) { Value *Off = ConstantInt::get(AI.getArraySize()->getType(), Offset, true); Amt = AllocaBuilder.CreateAdd(Amt, Off); } - + AllocaInst *New = AllocaBuilder.CreateAlloca(CastElTy, Amt); New->setAlignment(AI.getAlignment()); New->takeName(&AI); - + // If the allocation has multiple real uses, insert a cast and change all // things that used it to use the new cast. This will also hack on CI, but it // will die soon. @@ -148,10 +148,10 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, return ReplaceInstUsesWith(CI, New); } -/// EvaluateInDifferentType - Given an expression that +/// EvaluateInDifferentType - Given an expression that /// CanEvaluateTruncated or CanEvaluateSExtd returns true for, actually /// insert the code to evaluate the expression. -Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, +Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned) { if (Constant *C = dyn_cast<Constant>(V)) { C = ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/); @@ -181,7 +181,7 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, Value *RHS = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned); Res = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS); break; - } + } case Instruction::Trunc: case Instruction::ZExt: case Instruction::SExt: @@ -190,7 +190,7 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, // new. if (I->getOperand(0)->getType() == Ty) return I->getOperand(0); - + // Otherwise, must be the same type of cast, so just reinsert a new one. // This also handles the case of zext(trunc(x)) -> zext(x). Res = CastInst::CreateIntegerCast(I->getOperand(0), Ty, @@ -212,11 +212,11 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, Res = NPN; break; } - default: + default: // TODO: Can handle more cases here. llvm_unreachable("Unreachable!"); } - + Res->takeName(I); return InsertNewInstWith(Res, *I); } @@ -224,7 +224,7 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, /// This function is a wrapper around CastInst::isEliminableCastPair. It /// simply extracts arguments and returns what that function returns. 
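Before the wrapper itself, one classic eliminable pair for intuition, modeled in plain C++ (the values are illustrative):

#include <cstdint>
#include <cstdio>

// zext i8 -> i32 followed by trunc i32 -> i16 collapses to zext i8 -> i16:
// both routes produce the same zero-extended 16-bit value.
int main() {
  uint8_t X = 0xAB;
  uint16_t PairWise = (uint16_t)(uint32_t)X; // zext then trunc
  uint16_t Direct   = (uint16_t)X;           // the single collapsed cast
  std::printf("%d\n", PairWise == Direct);   // prints 1
  return 0;
}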
-static Instruction::CastOps +static Instruction::CastOps isEliminableCastPair( const CastInst *CI, ///< The first cast instruction unsigned opcode, ///< The opcode of the second cast instruction @@ -253,7 +253,7 @@ isEliminableCastPair( if ((Res == Instruction::IntToPtr && SrcTy != DstIntPtrTy) || (Res == Instruction::PtrToInt && DstTy != SrcIntPtrTy)) Res = 0; - + return Instruction::CastOps(Res); } @@ -265,18 +265,18 @@ bool InstCombiner::ShouldOptimizeCast(Instruction::CastOps opc, const Value *V, Type *Ty) { // Noop casts and casts of constants should be eliminated trivially. if (V->getType() == Ty || isa<Constant>(V)) return false; - + // If this is another cast that can be eliminated, we prefer to have it // eliminated. if (const CastInst *CI = dyn_cast<CastInst>(V)) if (isEliminableCastPair(CI, opc, Ty, TD)) return false; - + // If this is a vector sext from a compare, then we don't want to break the // idiom where each element of the extended vector is either zero or all ones. if (opc == Instruction::SExt && isa<CmpInst>(V) && Ty->isVectorTy()) return false; - + return true; } @@ -288,7 +288,7 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) { // Many cases of "cast of a cast" are eliminable. If it's eliminable we just // eliminate it now. if (CastInst *CSrc = dyn_cast<CastInst>(Src)) { // A->B->C cast - if (Instruction::CastOps opc = + if (Instruction::CastOps opc = isEliminableCastPair(CSrc, CI.getOpcode(), CI.getType(), TD)) { // The first cast (CSrc) is eliminable so we need to fix up or replace // the second cast (CI). CSrc will then have a good chance of being dead. @@ -311,7 +311,7 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) { if (Instruction *NV = FoldOpIntoPhi(CI)) return NV; } - + return 0; } @@ -330,15 +330,15 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty) { // We can always evaluate constants in another type. if (isa<Constant>(V)) return true; - + Instruction *I = dyn_cast<Instruction>(V); if (!I) return false; - + Type *OrigTy = V->getType(); - + // If this is an extension from the dest type, we can eliminate it, even if it // has multiple uses. - if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) && + if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) && I->getOperand(0)->getType() == Ty) return true; @@ -423,29 +423,29 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty) { // TODO: Can handle more cases here. break; } - + return false; } Instruction *InstCombiner::visitTrunc(TruncInst &CI) { if (Instruction *Result = commonCastTransforms(CI)) return Result; - - // See if we can simplify any instructions used by the input whose sole + + // See if we can simplify any instructions used by the input whose sole // purpose is to compute bits we don't care about. if (SimplifyDemandedInstructionBits(CI)) return &CI; - + Value *Src = CI.getOperand(0); Type *DestTy = CI.getType(), *SrcTy = Src->getType(); - + // Attempt to truncate the entire input expression tree to the destination // type. Only do this if the dest type is a simple type, don't convert the // expression tree to something weird like i93 unless the source is also // strange. if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) && CanEvaluateTruncated(Src, DestTy)) { - + // If this cast is a truncate, evaluting in a different type always // eliminates the cast, so it is always a win. 
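A sketch of what that win looks like when the expression is an add (hypothetical standalone IRBuilder usage, not code from the patch): every operand is narrowed, so the trailing trunc vanishes.

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// trunc(add i64 %x, %y) to i32 is rebuilt as add i32 (trunc %x), (trunc %y):
// the whole expression is evaluated directly in the destination type.
static Value *narrowAdd(IRBuilder<> &B, Value *X, Value *Y, Type *NarrowTy) {
  Value *NX = B.CreateTrunc(X, NarrowTy);
  Value *NY = B.CreateTrunc(Y, NarrowTy);
  return B.CreateAdd(NX, NY, "narrow.add"); // no final trunc needed
}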
DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type" @@ -462,7 +462,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { Value *Zero = Constant::getNullValue(Src->getType()); return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero); } - + // Transform trunc(lshr (zext A), Cst) to eliminate one type conversion. Value *A = 0; ConstantInt *Cst = 0; if (Src->hasOneUse() && @@ -472,7 +472,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { // ASize < MidSize and MidSize > ResultSize, but don't know the relation // between ASize and ResultSize. unsigned ASize = A->getType()->getPrimitiveSizeInBits(); - + // If the shift amount is larger than the size of A, then the result is // known to be zero because all the input bits got shifted out. if (Cst->getZExtValue() >= ASize) @@ -485,7 +485,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { Shift->takeName(Src); return CastInst::CreateIntegerCast(Shift, CI.getType(), false); } - + // Transform "trunc (and X, cst)" -> "and (trunc X), cst" so long as the dest // type isn't non-native. if (Src->hasOneUse() && isa<IntegerType>(Src->getType()) && @@ -508,7 +508,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, // cast to integer to avoid the comparison. if (ConstantInt *Op1C = dyn_cast<ConstantInt>(ICI->getOperand(1))) { const APInt &Op1CV = Op1C->getValue(); - + // zext (x <s 0) to i32 --> x>>u31 true if signbit set. // zext (x >s -1) to i32 --> (x>>u31)^1 true if signbit clear. if ((ICI->getPredicate() == ICmpInst::ICMP_SLT && Op1CV == 0) || @@ -538,14 +538,14 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, // zext (X != 0) to i32 --> X>>1 iff X has only the 2nd bit set. // zext (X != 1) to i32 --> X^1 iff X has only the low bit set. // zext (X != 2) to i32 --> (X>>1)^1 iff X has only the 2nd bit set. - if ((Op1CV == 0 || Op1CV.isPowerOf2()) && + if ((Op1CV == 0 || Op1CV.isPowerOf2()) && // This only works for EQ and NE ICI->isEquality()) { // If Op1C some other power of two, convert: uint32_t BitWidth = Op1C->getType()->getBitWidth(); APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); ComputeMaskedBits(ICI->getOperand(0), KnownZero, KnownOne); - + APInt KnownZeroMask(~KnownZero); if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1? if (!DoXform) return ICI; @@ -559,7 +559,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, Res = ConstantExpr::getZExt(Res, CI.getType()); return ReplaceInstUsesWith(CI, Res); } - + uint32_t ShiftAmt = KnownZeroMask.logBase2(); Value *In = ICI->getOperand(0); if (ShiftAmt) { @@ -568,12 +568,12 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, In = Builder->CreateLShr(In, ConstantInt::get(In->getType(),ShiftAmt), In->getName()+".lobit"); } - + if ((Op1CV != 0) == isNE) { // Toggle the low bit. Constant *One = ConstantInt::get(In->getType(), 1); In = Builder->CreateXor(In, One); } - + if (CI.getType() == In->getType()) return ReplaceInstUsesWith(CI, In); return CastInst::CreateIntegerCast(In, CI.getType(), false/*ZExt*/); @@ -646,19 +646,19 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) { BitsToClear = 0; if (isa<Constant>(V)) return true; - + Instruction *I = dyn_cast<Instruction>(V); if (!I) return false; - + // If the input is a truncate from the destination type, we can trivially // eliminate it. 
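The trivial elimination mentioned above has a simple scalar reading; a quick plain-C++ check with an illustrative width (the range check itself follows in the code):

#include <cstdint>
#include <cstdio>

// zext(trunc x) back to the original type is just a mask of the low bits:
// (uint32_t)(uint8_t)x == (x & 0xFF) for every x.
int main() {
  uint32_t X = 0xDEADBEEFu;
  std::printf("%d\n", (uint32_t)(uint8_t)X == (X & 0xFFu)); // prints 1
  return 0;
}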
if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty) return true; - + // We can't extend or shrink something that has multiple uses: doing so would // require duplicating the instruction in general, which isn't profitable. if (!I->hasOneUse()) return false; - + unsigned Opc = I->getOpcode(), Tmp; switch (Opc) { case Instruction::ZExt: // zext(zext(x)) -> zext(x). @@ -678,7 +678,7 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) { // These can all be promoted if neither operand has 'bits to clear'. if (BitsToClear == 0 && Tmp == 0) return true; - + // If the operation is an AND/OR/XOR and the bits to clear are zero in the // other side, BitsToClear is ok. if (Tmp == 0 && @@ -691,10 +691,10 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) { APInt::getHighBitsSet(VSize, BitsToClear))) return true; } - + // Otherwise, we don't know how to analyze this BitsToClear case yet. return false; - + case Instruction::LShr: // We can promote lshr(x, cst) if we can promote x. This requires the // ultimate 'and' to clear out the high zero bits we're clearing out though. @@ -716,7 +716,7 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) { Tmp != BitsToClear) return false; return true; - + case Instruction::PHI: { // We can change a phi if we can change all operands. Note that we never // get into trouble with cyclic PHIs here because we only consider @@ -739,48 +739,48 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) { } Instruction *InstCombiner::visitZExt(ZExtInst &CI) { - // If this zero extend is only used by a truncate, let the truncate by + // If this zero extend is only used by a truncate, let the truncate be // eliminated before we try to optimize this zext. if (CI.hasOneUse() && isa<TruncInst>(CI.use_back())) return 0; - + // If one of the common conversion will work, do it. if (Instruction *Result = commonCastTransforms(CI)) return Result; - // See if we can simplify any instructions used by the input whose sole + // See if we can simplify any instructions used by the input whose sole // purpose is to compute bits we don't care about. if (SimplifyDemandedInstructionBits(CI)) return &CI; - + Value *Src = CI.getOperand(0); Type *SrcTy = Src->getType(), *DestTy = CI.getType(); - + // Attempt to extend the entire input expression tree to the destination // type. Only do this if the dest type is a simple type, don't convert the // expression tree to something weird like i93 unless the source is also // strange. unsigned BitsToClear; if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) && - CanEvaluateZExtd(Src, DestTy, BitsToClear)) { + CanEvaluateZExtd(Src, DestTy, BitsToClear)) { assert(BitsToClear < SrcTy->getScalarSizeInBits() && "Unreasonable BitsToClear"); - + // Okay, we can transform this! Insert the new expression now. DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type" " to avoid zero extend: " << CI); Value *Res = EvaluateInDifferentType(Src, DestTy, false); assert(Res->getType() == DestTy); - + uint32_t SrcBitsKept = SrcTy->getScalarSizeInBits()-BitsToClear; uint32_t DestBitSize = DestTy->getScalarSizeInBits(); - + // If the high bits are already filled with zeros, just replace this // cast with the result. if (MaskedValueIsZero(Res, APInt::getHighBitsSet(DestBitSize, DestBitSize-SrcBitsKept))) return ReplaceInstUsesWith(CI, Res); - + // We need to emit an AND to clear the high bits. 
Constant *C = ConstantInt::get(Res->getType(), APInt::getLowBitsSet(DestBitSize, SrcBitsKept)); @@ -792,7 +792,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { // 'and' which will be much cheaper than the pair of casts. if (TruncInst *CSrc = dyn_cast<TruncInst>(Src)) { // A->B->C cast // TODO: Subsume this into EvaluateInDifferentType. - + // Get the sizes of the types involved. We know that the intermediate type // will be smaller than A or C, but don't know the relation between A and C. Value *A = CSrc->getOperand(0); @@ -809,7 +809,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { Value *And = Builder->CreateAnd(A, AndConst, CSrc->getName()+".mask"); return new ZExtInst(And, CI.getType()); } - + if (SrcSize == DstSize) { APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize)); return BinaryOperator::CreateAnd(A, ConstantInt::get(A->getType(), @@ -818,7 +818,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { if (SrcSize > DstSize) { Value *Trunc = Builder->CreateTrunc(A, CI.getType()); APInt AndValue(APInt::getLowBitsSet(DstSize, MidSize)); - return BinaryOperator::CreateAnd(Trunc, + return BinaryOperator::CreateAnd(Trunc, ConstantInt::get(Trunc->getType(), AndValue)); } @@ -876,7 +876,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { Value *New = Builder->CreateZExt(X, CI.getType()); return BinaryOperator::CreateXor(New, ConstantInt::get(CI.getType(), 1)); } - + return 0; } @@ -989,14 +989,14 @@ static bool CanEvaluateSExtd(Value *V, Type *Ty) { // If this is a constant, it can be trivially promoted. if (isa<Constant>(V)) return true; - + Instruction *I = dyn_cast<Instruction>(V); if (!I) return false; - + // If this is a truncate from the dest type, we can trivially eliminate it. if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty) return true; - + // We can't extend or shrink something that has multiple uses: doing so would // require duplicating the instruction in general, which isn't profitable. if (!I->hasOneUse()) return false; @@ -1015,14 +1015,14 @@ static bool CanEvaluateSExtd(Value *V, Type *Ty) { // These operators can all arbitrarily be extended if their inputs can. return CanEvaluateSExtd(I->getOperand(0), Ty) && CanEvaluateSExtd(I->getOperand(1), Ty); - + //case Instruction::Shl: TODO //case Instruction::LShr: TODO - + case Instruction::Select: return CanEvaluateSExtd(I->getOperand(1), Ty) && CanEvaluateSExtd(I->getOperand(2), Ty); - + case Instruction::PHI: { // We can change a phi if we can change all operands. Note that we never // get into trouble with cyclic PHIs here because we only consider @@ -1036,24 +1036,24 @@ static bool CanEvaluateSExtd(Value *V, Type *Ty) { // TODO: Can handle more cases here. break; } - + return false; } Instruction *InstCombiner::visitSExt(SExtInst &CI) { - // If this sign extend is only used by a truncate, let the truncate by - // eliminated before we try to optimize this zext. + // If this sign extend is only used by a truncate, let the truncate be + // eliminated before we try to optimize this sext. if (CI.hasOneUse() && isa<TruncInst>(CI.use_back())) return 0; - + if (Instruction *I = commonCastTransforms(CI)) return I; - - // See if we can simplify any instructions used by the input whose sole + + // See if we can simplify any instructions used by the input whose sole // purpose is to compute bits we don't care about. 
if (SimplifyDemandedInstructionBits(CI)) return &CI; - + Value *Src = CI.getOperand(0); Type *SrcTy = Src->getType(), *DestTy = CI.getType(); @@ -1076,7 +1076,7 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) { // cast with the result. if (ComputeNumSignBits(Res) > DestBitSize - SrcBitSize) return ReplaceInstUsesWith(CI, Res); - + // We need to emit a shl + ashr to do the sign extend. Value *ShAmt = ConstantInt::get(DestTy, DestBitSize-SrcBitSize); return BinaryOperator::CreateAShr(Builder->CreateShl(Res, ShAmt, "sext"), @@ -1089,7 +1089,7 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) { if (TI->hasOneUse() && TI->getOperand(0)->getType() == DestTy) { uint32_t SrcBitSize = SrcTy->getScalarSizeInBits(); uint32_t DestBitSize = DestTy->getScalarSizeInBits(); - + // We need to emit a shl + ashr to do the sign extend. Value *ShAmt = ConstantInt::get(DestTy, DestBitSize-SrcBitSize); Value *Res = Builder->CreateShl(TI->getOperand(0), ShAmt, "sext"); @@ -1125,7 +1125,7 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) { A = Builder->CreateShl(A, ShAmtV, CI.getName()); return BinaryOperator::CreateAShr(A, ShAmtV); } - + return 0; } @@ -1147,7 +1147,7 @@ static Value *LookThroughFPExtensions(Value *V) { if (Instruction *I = dyn_cast<Instruction>(V)) if (I->getOpcode() == Instruction::FPExt) return LookThroughFPExtensions(I->getOperand(0)); - + // If this value is a constant, return the constant in the smallest FP type // that can accurately represent it. This allows us to turn // (float)((double)X+2.0) into x+2.0f. @@ -1166,14 +1166,14 @@ static Value *LookThroughFPExtensions(Value *V) { return V; // Don't try to shrink to various long double types. } - + return V; } Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { if (Instruction *I = commonCastTransforms(CI)) return I; - + // If we have fptrunc(fadd (fpextend x), (fpextend y)), where x and y are // smaller than the destination type, we can eliminate the truncate by doing // the add as the smaller type. 
This applies to fadd/fsub/fmul/fdiv as well @@ -1190,7 +1190,7 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { Type *SrcTy = OpI->getType(); Value *LHSTrunc = LookThroughFPExtensions(OpI->getOperand(0)); Value *RHSTrunc = LookThroughFPExtensions(OpI->getOperand(1)); - if (LHSTrunc->getType() != SrcTy && + if (LHSTrunc->getType() != SrcTy && RHSTrunc->getType() != SrcTy) { unsigned DstSize = CI.getType()->getScalarSizeInBits(); // If the source types were both smaller than the destination type of @@ -1202,10 +1202,36 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { return BinaryOperator::Create(OpI->getOpcode(), LHSTrunc, RHSTrunc); } } - break; + break; + } + + // (fptrunc (fneg x)) -> (fneg (fptrunc x)) + if (BinaryOperator::isFNeg(OpI)) { + Value *InnerTrunc = Builder->CreateFPTrunc(OpI->getOperand(1), + CI.getType()); + return BinaryOperator::CreateFNeg(InnerTrunc); + } + } + + IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI.getOperand(0)); + if (II) { + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::fabs: { + // (fptrunc (fabs x)) -> (fabs (fptrunc x)) + Value *InnerTrunc = Builder->CreateFPTrunc(II->getArgOperand(0), + CI.getType()); + Type *IntrinsicType[] = { CI.getType() }; + Function *Overload = + Intrinsic::getDeclaration(CI.getParent()->getParent()->getParent(), + II->getIntrinsicID(), IntrinsicType); + + Value *Args[] = { InnerTrunc }; + return CallInst::Create(Overload, Args, II->getName()); + } } } - + // Fold (fptrunc (sqrt (fpext x))) -> (sqrtf x) CallInst *Call = dyn_cast<CallInst>(CI.getOperand(0)); if (Call && Call->getCalledFunction() && TLI->has(LibFunc::sqrtf) && @@ -1220,7 +1246,7 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { Arg->getOperand(0)->getType()->isFloatTy()) { Function *Callee = Call->getCalledFunction(); Module *M = CI.getParent()->getParent()->getParent(); - Constant *SqrtfFunc = M->getOrInsertFunction("sqrtf", + Constant *SqrtfFunc = M->getOrInsertFunction("sqrtf", Callee->getAttributes(), Builder->getFloatTy(), Builder->getFloatTy(), @@ -1228,15 +1254,15 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { CallInst *ret = CallInst::Create(SqrtfFunc, Arg->getOperand(0), "sqrtfcall"); ret->setAttributes(Callee->getAttributes()); - - + + // Remove the old Call. With -fmath-errno, it won't get marked readnone. ReplaceInstUsesWith(*Call, UndefValue::get(Call->getType())); EraseInstFromFunction(*Call); return ret; } } - + return 0; } @@ -1254,7 +1280,7 @@ Instruction *InstCombiner::visitFPToUI(FPToUIInst &FI) { // This is safe if the intermediate type has enough bits in its mantissa to // accurately represent all values of X. For example, do not do this with // i64->float->i64. This is also safe for sitofp case, because any negative - // 'X' value would cause an undefined result for the fptoui. + // 'X' value would cause an undefined result for the fptoui. if ((isa<UIToFPInst>(OpI) || isa<SIToFPInst>(OpI)) && OpI->getOperand(0)->getType() == FI.getType() && (int)FI.getType()->getScalarSizeInBits() < /*extra bit for sign */ @@ -1268,19 +1294,19 @@ Instruction *InstCombiner::visitFPToSI(FPToSIInst &FI) { Instruction *OpI = dyn_cast<Instruction>(FI.getOperand(0)); if (OpI == 0) return commonCastTransforms(FI); - + // fptosi(sitofp(X)) --> X // fptosi(uitofp(X)) --> X // This is safe if the intermediate type has enough bits in its mantissa to // accurately represent all values of X. For example, do not do this with // i64->float->i64. 
This is also safe for sitofp case, because any negative - // 'X' value would cause an undefined result for the fptoui. + // 'X' value would cause an undefined result for the fptoui. if ((isa<UIToFPInst>(OpI) || isa<SIToFPInst>(OpI)) && OpI->getOperand(0)->getType() == FI.getType() && (int)FI.getType()->getScalarSizeInBits() <= OpI->getType()->getFPMantissaWidth()) return ReplaceInstUsesWith(FI, OpI->getOperand(0)); - + return commonCastTransforms(FI); } @@ -1296,21 +1322,16 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) { // If the source integer type is not the intptr_t type for this target, do a // trunc or zext to the intptr_t type, then inttoptr of it. This allows the // cast to be exposed to other transforms. - if (TD) { - if (CI.getOperand(0)->getType()->getScalarSizeInBits() > - TD->getPointerSizeInBits()) { - Value *P = Builder->CreateTrunc(CI.getOperand(0), - TD->getIntPtrType(CI.getContext())); - return new IntToPtrInst(P, CI.getType()); - } - if (CI.getOperand(0)->getType()->getScalarSizeInBits() < - TD->getPointerSizeInBits()) { - Value *P = Builder->CreateZExt(CI.getOperand(0), - TD->getIntPtrType(CI.getContext())); - return new IntToPtrInst(P, CI.getType()); - } + if (TD && CI.getOperand(0)->getType()->getScalarSizeInBits() != + TD->getPointerSizeInBits()) { + Type *Ty = TD->getIntPtrType(CI.getContext()); + if (CI.getType()->isVectorTy()) // Handle vectors of pointers. + Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements()); + + Value *P = Builder->CreateZExtOrTrunc(CI.getOperand(0), Ty); + return new IntToPtrInst(P, CI.getType()); } - + if (Instruction *I = commonCastTransforms(CI)) return I; @@ -1320,19 +1341,19 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) { /// @brief Implement the transforms for cast of pointer (bitcast/ptrtoint) Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) { Value *Src = CI.getOperand(0); - + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Src)) { // If casting the result of a getelementptr instruction with no offset, turn // this into a cast of the original pointer! if (GEP->hasAllZeroIndices()) { // Changing the cast operand is usually not a good idea but it is safe - // here because the pointer operand is being replaced with another + // here because the pointer operand is being replaced with another // pointer operand so the opcode doesn't need to change. Worklist.Add(GEP); CI.setOperand(0, GEP->getOperand(0)); return &CI; } - + // If the GEP has a single use, and the base pointer is a bitcast, and the // GEP computes a constant offset, see if we can convert these three // instructions into fewer. This typically happens with unions and other @@ -1353,15 +1374,15 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) { Builder->CreateInBoundsGEP(OrigBase, NewIndices) : Builder->CreateGEP(OrigBase, NewIndices); NGEP->takeName(GEP); - + if (isa<BitCastInst>(CI)) return new BitCastInst(NGEP, CI.getType()); assert(isa<PtrToIntInst>(CI)); return new PtrToIntInst(NGEP, CI.getType()); - } + } } } - + return commonCastTransforms(CI); } @@ -1369,19 +1390,15 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) { // If the destination integer type is not the intptr_t type for this target, // do a ptrtoint to intptr_t then do a trunc or zext. This allows the cast // to be exposed to other transforms. 
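The unified replacement shown in the hunk that follows can be summarized by a small hypothetical helper: CreateZExtOrTrunc picks zext or trunc from the relative widths, and a vector destination only changes the intermediate type.

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Sketch: canonicalize ptrtoint to a non-pointer-sized integer as
// ptrtoint-to-intptr_t plus one integer cast, exposing the pointer-sized
// value to other transforms. Illustrative only.
static Value *canonicalPtrToInt(IRBuilder<> &B, Value *Ptr, Type *IntPtrTy,
                                Type *DestTy) {
  Value *P = B.CreatePtrToInt(Ptr, IntPtrTy);
  return B.CreateZExtOrTrunc(P, DestTy);
}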
- if (TD) { - if (CI.getType()->getScalarSizeInBits() < TD->getPointerSizeInBits()) { - Value *P = Builder->CreatePtrToInt(CI.getOperand(0), - TD->getIntPtrType(CI.getContext())); - return new TruncInst(P, CI.getType()); - } - if (CI.getType()->getScalarSizeInBits() > TD->getPointerSizeInBits()) { - Value *P = Builder->CreatePtrToInt(CI.getOperand(0), - TD->getIntPtrType(CI.getContext())); - return new ZExtInst(P, CI.getType()); - } + if (TD && CI.getType()->getScalarSizeInBits() != TD->getPointerSizeInBits()) { + Type *Ty = TD->getIntPtrType(CI.getContext()); + if (CI.getType()->isVectorTy()) // Handle vectors of pointers. + Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements()); + + Value *P = Builder->CreatePtrToInt(CI.getOperand(0), Ty); + return CastInst::CreateIntegerCast(P, CI.getType(), /*isSigned=*/false); } - + return commonPointerCastTransforms(CI); } @@ -1396,33 +1413,33 @@ static Instruction *OptimizeVectorResize(Value *InVal, VectorType *DestTy, // element size, or the input is a multiple of the output element size. // Convert the input type to have the same element type as the output. VectorType *SrcTy = cast<VectorType>(InVal->getType()); - + if (SrcTy->getElementType() != DestTy->getElementType()) { // The input types don't need to be identical, but for now they must be the // same size. There is no specific reason we couldn't handle things like // <4 x i16> -> <4 x i32> by bitcasting to <2 x i32> but haven't gotten - // there yet. + // there yet. if (SrcTy->getElementType()->getPrimitiveSizeInBits() != DestTy->getElementType()->getPrimitiveSizeInBits()) return 0; - + SrcTy = VectorType::get(DestTy->getElementType(), SrcTy->getNumElements()); InVal = IC.Builder->CreateBitCast(InVal, SrcTy); } - + // Now that the element types match, get the shuffle mask and RHS of the // shuffle to use, which depends on whether we're increasing or decreasing the // size of the input. SmallVector<uint32_t, 16> ShuffleMask; Value *V2; - + if (SrcTy->getNumElements() > DestTy->getNumElements()) { // If we're shrinking the number of elements, just shuffle in the low // elements from the input and use undef as the second shuffle input. V2 = UndefValue::get(SrcTy); for (unsigned i = 0, e = DestTy->getNumElements(); i != e; ++i) ShuffleMask.push_back(i); - + } else { // If we're increasing the number of elements, shuffle in all of the // elements from InVal and fill the rest of the result elements with zeros @@ -1436,7 +1453,7 @@ static Instruction *OptimizeVectorResize(Value *InVal, VectorType *DestTy, for (unsigned i = 0, e = DestTy->getNumElements()-SrcElts; i != e; ++i) ShuffleMask.push_back(SrcElts); } - + return new ShuffleVectorInst(InVal, V2, ConstantDataVector::get(V2->getContext(), ShuffleMask)); @@ -1463,7 +1480,7 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, Type *VecEltTy) { // Undef values never contribute useful bits to the result. if (isa<UndefValue>(V)) return true; - + // If we got down to a value of the right type, we win, try inserting into the // right element. if (V->getType() == VecEltTy) { @@ -1471,15 +1488,15 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, if (Constant *C = dyn_cast<Constant>(V)) if (C->isNullValue()) return true; - + // Fail if multiple elements are inserted into this slot. 
if (ElementIndex >= Elements.size() || Elements[ElementIndex] != 0) return false; - + Elements[ElementIndex] = V; return true; } - + if (Constant *C = dyn_cast<Constant>(V)) { // Figure out the # elements this provides, and bitcast it or slice it up // as required. @@ -1490,7 +1507,7 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, if (NumElts == 1) return CollectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy), ElementIndex, Elements, VecEltTy); - + // Okay, this is a constant that covers multiple elements. Slice it up into // pieces and insert each element-sized piece into the vector. if (!isa<IntegerType>(C->getType())) @@ -1498,7 +1515,7 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, C->getType()->getPrimitiveSizeInBits())); unsigned ElementSize = VecEltTy->getPrimitiveSizeInBits(); Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize); - + for (unsigned i = 0; i != NumElts; ++i) { Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(), i*ElementSize)); @@ -1508,23 +1525,23 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, } return true; } - + if (!V->hasOneUse()) return false; - + Instruction *I = dyn_cast<Instruction>(V); if (I == 0) return false; switch (I->getOpcode()) { default: return false; // Unhandled case. case Instruction::BitCast: return CollectInsertionElements(I->getOperand(0), ElementIndex, - Elements, VecEltTy); + Elements, VecEltTy); case Instruction::ZExt: if (!isMultipleOfTypeSize( I->getOperand(0)->getType()->getPrimitiveSizeInBits(), VecEltTy)) return false; return CollectInsertionElements(I->getOperand(0), ElementIndex, - Elements, VecEltTy); + Elements, VecEltTy); case Instruction::Or: return CollectInsertionElements(I->getOperand(0), ElementIndex, Elements, VecEltTy) && @@ -1536,11 +1553,11 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, if (CI == 0) return false; if (!isMultipleOfTypeSize(CI->getZExtValue(), VecEltTy)) return false; unsigned IndexShift = getTypeSizeIndex(CI->getZExtValue(), VecEltTy); - + return CollectInsertionElements(I->getOperand(0), ElementIndex+IndexShift, Elements, VecEltTy); } - + } } @@ -1575,11 +1592,11 @@ static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI, Value *Result = Constant::getNullValue(CI.getType()); for (unsigned i = 0, e = Elements.size(); i != e; ++i) { if (Elements[i] == 0) continue; // Unset element. 
- + Result = IC.Builder->CreateInsertElement(Result, Elements[i], IC.Builder->getInt32(i)); } - + return Result; } @@ -1607,11 +1624,11 @@ static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){ VecTy->getPrimitiveSizeInBits() / DestWidth); VecInput = IC.Builder->CreateBitCast(VecInput, VecTy); } - + return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(0)); } } - + // bitcast(trunc(lshr(bitcast(somevector), cst)) ConstantInt *ShAmt = 0; if (match(Src, m_Trunc(m_LShr(m_BitCast(m_Value(VecInput)), @@ -1628,7 +1645,7 @@ static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){ VecTy->getPrimitiveSizeInBits() / DestWidth); VecInput = IC.Builder->CreateBitCast(VecInput, VecTy); } - + unsigned Elt = ShAmt->getZExtValue() / DestWidth; return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt)); } @@ -1652,12 +1669,12 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { PointerType *SrcPTy = cast<PointerType>(SrcTy); Type *DstElTy = DstPTy->getElementType(); Type *SrcElTy = SrcPTy->getElementType(); - + // If the address spaces don't match, don't eliminate the bitcast, which is // required for changing types. if (SrcPTy->getAddressSpace() != DstPTy->getAddressSpace()) return 0; - + // If we are casting a alloca to a pointer to a type of the same // size, rewrite the allocation instruction to allocate the "right" type. // There is no need to modify malloc calls because it is their bitcast that @@ -1665,14 +1682,14 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { if (AllocaInst *AI = dyn_cast<AllocaInst>(Src)) if (Instruction *V = PromoteCastOfAllocation(CI, *AI)) return V; - + // If the source and destination are pointers, and this cast is equivalent // to a getelementptr X, 0, 0, 0... turn it into the appropriate gep. // This can enhance SROA and other transforms that want type-safe pointers. Constant *ZeroUInt = Constant::getNullValue(Type::getInt32Ty(CI.getContext())); unsigned NumZeros = 0; - while (SrcElTy != DstElTy && + while (SrcElTy != DstElTy && isa<CompositeType>(SrcElTy) && !SrcElTy->isPointerTy() && SrcElTy->getNumContainedTypes() /* not "{}" */) { SrcElTy = cast<CompositeType>(SrcElTy)->getTypeAtIndex(ZeroUInt); @@ -1685,7 +1702,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { return GetElementPtrInst::CreateInBounds(Src, Idxs); } } - + // Try to optimize int -> float bitcasts. if ((DestTy->isFloatTy() || DestTy->isDoubleTy()) && isa<IntegerType>(SrcTy)) if (Instruction *I = OptimizeIntToFloatBitCast(CI, *this)) @@ -1698,7 +1715,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { Constant::getNullValue(Type::getInt32Ty(CI.getContext()))); // FIXME: Canonicalize bitcast(insertelement) -> insertelement(bitcast) } - + if (isa<IntegerType>(SrcTy)) { // If this is a cast from an integer to vector, check to see if the input // is a trunc or zext of a bitcast from vector. If so, we can replace all @@ -1711,7 +1728,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { cast<VectorType>(DestTy), *this)) return I; } - + // If the input is an 'or' instruction, we may be doing shifts and ors to // assemble the elements of the vector manually. Try to rip the code out // and replace it with insertelements. 
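For reference, the scalar pattern that comment targets, spelled out in plain C++ (lane width chosen for illustration):

#include <cstdint>

// Packing two 32-bit lanes with shift+or is the integer spelling of two
// insertelements into a <2 x i32> followed by a bitcast to i64; the code
// above recovers the vector form from exactly this kind of expression.
static uint64_t packLanes(uint32_t Lo, uint32_t Hi) {
  return (uint64_t)Lo | ((uint64_t)Hi << 32);
}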
@@ -1721,18 +1738,29 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { } if (VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy)) { - if (SrcVTy->getNumElements() == 1 && !DestTy->isVectorTy()) { - Value *Elem = - Builder->CreateExtractElement(Src, - Constant::getNullValue(Type::getInt32Ty(CI.getContext()))); - return CastInst::Create(Instruction::BitCast, Elem, DestTy); + if (SrcVTy->getNumElements() == 1) { + // If our destination is not a vector, then make this a straight + // scalar-scalar cast. + if (!DestTy->isVectorTy()) { + Value *Elem = + Builder->CreateExtractElement(Src, + Constant::getNullValue(Type::getInt32Ty(CI.getContext()))); + return CastInst::Create(Instruction::BitCast, Elem, DestTy); + } + + // Otherwise, see if our source is an insert. If so, then use the scalar + // component directly. + if (InsertElementInst *IEI = + dyn_cast<InsertElementInst>(CI.getOperand(0))) + return CastInst::Create(Instruction::BitCast, IEI->getOperand(1), + DestTy); } } if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(Src)) { // Okay, we have (bitcast (shuffle ..)). Check to see if this is // a bitcast to a vector with the same # elts. - if (SVI->hasOneUse() && DestTy->isVectorTy() && + if (SVI->hasOneUse() && DestTy->isVectorTy() && cast<VectorType>(DestTy)->getNumElements() == SVI->getType()->getNumElements() && SVI->getType()->getNumElements() == @@ -1741,9 +1769,9 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { // If either of the operands is a cast from CI.getType(), then // evaluating the shuffle in the casted destination's type will allow // us to eliminate at least one cast. - if (((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(0))) && + if (((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(0))) && Tmp->getOperand(0)->getType() == DestTy) || - ((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(1))) && + ((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(1))) && Tmp->getOperand(0)->getType() == DestTy)) { Value *LHS = Builder->CreateBitCast(SVI->getOperand(0), DestTy); Value *RHS = Builder->CreateBitCast(SVI->getOperand(1), DestTy); @@ -1753,7 +1781,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { } } } - + if (SrcTy->isPointerTy()) return commonPointerCastTransforms(CI); return commonCastTransforms(CI); diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 40e559e..bad46b4 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1331,6 +1331,25 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, return new ICmpInst(TrueIfSigned ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ, And, Constant::getNullValue(And->getType())); } + + // Transform (icmp pred iM (shl iM %v, N), CI) + // -> (icmp pred i(M-N) (trunc %v iM to i(M-N)), (trunc (CI>>N)) + // Transform the shl to a trunc if (trunc (CI>>N)) has no loss. + // This enables us to get rid of the shift in favor of a trunc which can be + // free on the target. It has the additional benefit of comparing to a + // smaller constant, which will be target friendly. 
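One concrete instance of that rewrite, checked by brute force in plain C++ (the constants are illustrative): with M = 32, N = 8 and CI = 0x1200, the compare narrows to i24 against 0x12.

#include <cstdint>
#include <cstdio>

// (icmp eq i32 (shl i32 %v, 8), 0x1200) becomes
// (icmp eq i24 (trunc i32 %v to i24), 0x12): the shift discards the top
// 8 bits of %v, and the RHS has at least 8 trailing zeros.
int main() {
  for (uint32_t V = 0; V < (1u << 25); ++V) {
    bool Wide   = (V << 8) == 0x1200u;
    bool Narrow = (V & 0x00FFFFFFu) == 0x12u; // the i24 view of %v
    if (Wide != Narrow) { std::printf("mismatch at %u\n", V); return 1; }
  }
  std::printf("equivalent\n");
  return 0;
}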
+ unsigned Amt = ShAmt->getLimitedValue(TypeBits-1); + if (Amt != 0 && RHSV.countTrailingZeros() >= Amt) { + Type *NTy = IntegerType::get(ICI.getContext(), TypeBits - Amt); + Constant *NCI = ConstantExpr::getTrunc( + ConstantExpr::getAShr(RHS, + ConstantInt::get(RHS->getType(), Amt)), + NTy); + return new ICmpInst(ICI.getPredicate(), + Builder->CreateTrunc(LHSI->getOperand(0), NTy), + NCI); + } + break; } diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index d0f4392..8e4267f 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -377,6 +377,8 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { if (Value *V = SimplifyFMulInst(Op0, Op1, I.getFastMathFlags(), TD)) return ReplaceInstUsesWith(I, V); + bool AllowReassociate = I.hasUnsafeAlgebra(); + // Simplify mul instructions with a constant RHS. if (isa<Constant>(Op1)) { // Try to fold constant mul into select arguments. @@ -389,7 +391,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { return NV; ConstantFP *C = dyn_cast<ConstantFP>(Op1); - if (C && I.hasUnsafeAlgebra() && C->getValueAPF().isNormal()) { + if (C && AllowReassociate && C->getValueAPF().isNormal()) { // Let MDC denote an expression in one of these forms: // X * C, C/X, X/C, where C is a constant. // @@ -430,7 +432,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { BinaryOperator::CreateFAdd(M0, M1) : BinaryOperator::CreateFSub(M0, M1); Instruction *RI = cast<Instruction>(R); - RI->setHasUnsafeAlgebra(true); + RI->copyFastMathFlags(&I); return RI; } } @@ -438,9 +440,6 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { } } - if (Value *Op0v = dyn_castFNegVal(Op0)) // -X * -Y = X*Y - if (Value *Op1v = dyn_castFNegVal(Op1)) - return BinaryOperator::CreateFMul(Op0v, Op1v); // Under unsafe algebra do: // X * log2(0.5*Y) = X*log2(Y) - X @@ -469,36 +468,66 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { } } - // X * cond ? 1.0 : 0.0 => cond ? 
X : 0.0 - if (I.hasNoNaNs() && I.hasNoSignedZeros()) { - Value *V0 = I.getOperand(0); - Value *V1 = I.getOperand(1); - Value *Cond, *SLHS, *SRHS; - bool Match = false; - - if (match(V0, m_Select(m_Value(Cond), m_Value(SLHS), m_Value(SRHS)))) { - Match = true; - } else if (match(V1, m_Select(m_Value(Cond), m_Value(SLHS), - m_Value(SRHS)))) { - Match = true; - std::swap(V0, V1); + // Handle the symmetric situation in a 2-iteration loop + Value *Opnd0 = Op0; + Value *Opnd1 = Op1; + for (int i = 0; i < 2; i++) { + bool IgnoreZeroSign = I.hasNoSignedZeros(); + if (BinaryOperator::isFNeg(Opnd0, IgnoreZeroSign)) { + Value *N0 = dyn_castFNegVal(Opnd0, IgnoreZeroSign); + Value *N1 = dyn_castFNegVal(Opnd1, IgnoreZeroSign); + + // -X * -Y => X*Y + if (N1) + return BinaryOperator::CreateFMul(N0, N1); + + if (Opnd0->hasOneUse()) { + // -X * Y => -(X*Y) (Promote negation as high as possible) + Value *T = Builder->CreateFMul(N0, Opnd1); + cast<Instruction>(T)->setDebugLoc(I.getDebugLoc()); + Instruction *Neg = BinaryOperator::CreateFNeg(T); + if (I.getFastMathFlags().any()) { + cast<Instruction>(T)->copyFastMathFlags(&I); + Neg->copyFastMathFlags(&I); + } + return Neg; + } } - if (Match) { - ConstantFP *C0 = dyn_cast<ConstantFP>(SLHS); - ConstantFP *C1 = dyn_cast<ConstantFP>(SRHS); - - if (C0 && C1 && - ((C0->isZero() && C1->isExactlyValue(1.0)) || - (C1->isZero() && C0->isExactlyValue(1.0)))) { - Value *T; - if (C0->isZero()) - T = Builder->CreateSelect(Cond, SLHS, V1); - else - T = Builder->CreateSelect(Cond, V1, SRHS); - return ReplaceInstUsesWith(I, T); + // (X*Y) * X => (X*X) * Y where Y != X + // The purpose is two-fold: + // 1) to form a power expression (of X). + // 2) potentially shorten the critical path: After transformation, the + // latency of the instruction Y is amortized by the expression of X*X, + // and therefore Y is in a "less critical" position compared to what it + // was before the transformation. + // + if (AllowReassociate) { + Value *Opnd0_0, *Opnd0_1; + if (Opnd0->hasOneUse() && + match(Opnd0, m_FMul(m_Value(Opnd0_0), m_Value(Opnd0_1)))) { + Value *Y = 0; + if (Opnd0_0 == Opnd1 && Opnd0_1 != Opnd1) + Y = Opnd0_1; + else if (Opnd0_1 == Opnd1 && Opnd0_0 != Opnd1) + Y = Opnd0_0; + + if (Y) { + Instruction *T = cast<Instruction>(Builder->CreateFMul(Opnd1, Opnd1)); + T->copyFastMathFlags(&I); + T->setDebugLoc(I.getDebugLoc()); + + Instruction *R = BinaryOperator::CreateFMul(T, Y); + R->copyFastMathFlags(&I); + return R; + } } } + + if (!isa<Constant>(Op1)) + std::swap(Opnd0, Opnd1); + else + break; } return Changed ? &I : 0; @@ -784,21 +813,140 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) { return 0; } +/// CvtFDivConstToReciprocal tries to convert X/C into X*1/C if C is not a special +/// FP value and: +/// 1) 1/C is exact, or +/// 2) reciprocal is allowed. +/// If the conversion was successful, the simplified expression "X * 1/C" is +/// returned; otherwise, NULL is returned. 
+/// +static Instruction *CvtFDivConstToReciprocal(Value *Dividend, + ConstantFP *Divisor, + bool AllowReciprocal) { + const APFloat &FpVal = Divisor->getValueAPF(); + APFloat Reciprocal(FpVal.getSemantics()); + bool Cvt = FpVal.getExactInverse(&Reciprocal); + + if (!Cvt && AllowReciprocal && FpVal.isNormal()) { + Reciprocal = APFloat(FpVal.getSemantics(), 1.0f); + (void)Reciprocal.divide(FpVal, APFloat::rmNearestTiesToEven); + Cvt = !Reciprocal.isDenormal(); + } + + if (!Cvt) + return 0; + + ConstantFP *R; + R = ConstantFP::get(Dividend->getType()->getContext(), Reciprocal); + return BinaryOperator::CreateFMul(Dividend, R); +} + Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); if (Value *V = SimplifyFDivInst(Op0, Op1, TD)) return ReplaceInstUsesWith(I, V); + bool AllowReassociate = I.hasUnsafeAlgebra(); + bool AllowReciprocal = I.hasAllowReciprocal(); + if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) { - const APFloat &Op1F = Op1C->getValueAPF(); - - // If the divisor has an exact multiplicative inverse we can turn the fdiv - // into a cheaper fmul. - APFloat Reciprocal(Op1F.getSemantics()); - if (Op1F.getExactInverse(&Reciprocal)) { - ConstantFP *RFP = ConstantFP::get(Builder->getContext(), Reciprocal); - return BinaryOperator::CreateFMul(Op0, RFP); + if (AllowReassociate) { + ConstantFP *C1 = 0; + ConstantFP *C2 = Op1C; + Value *X; + Instruction *Res = 0; + + if (match(Op0, m_FMul(m_Value(X), m_ConstantFP(C1)))) { + // (X*C1)/C2 => X * (C1/C2) + // + Constant *C = ConstantExpr::getFDiv(C1, C2); + const APFloat &F = cast<ConstantFP>(C)->getValueAPF(); + if (F.isNormal() && !F.isDenormal()) + Res = BinaryOperator::CreateFMul(X, C); + } else if (match(Op0, m_FDiv(m_Value(X), m_ConstantFP(C1)))) { + // (X/C1)/C2 => X /(C2*C1) [=> X * 1/(C2*C1) if reciprocal is allowed] + // + Constant *C = ConstantExpr::getFMul(C1, C2); + const APFloat &F = cast<ConstantFP>(C)->getValueAPF(); + if (F.isNormal() && !F.isDenormal()) { + Res = CvtFDivConstToReciprocal(X, cast<ConstantFP>(C), + AllowReciprocal); + if (!Res) + Res = BinaryOperator::CreateFDiv(X, C); + } + } + + if (Res) { + Res->setFastMathFlags(I.getFastMathFlags()); + return Res; + } + } + + // X / C => X * 1/C + if (Instruction *T = CvtFDivConstToReciprocal(Op0, Op1C, AllowReciprocal)) + return T; + + return 0; + } + + if (AllowReassociate && isa<ConstantFP>(Op0)) { + ConstantFP *C1 = cast<ConstantFP>(Op0), *C2; + Constant *Fold = 0; + Value *X; + bool CreateDiv = true; + + // C1 / (X*C2) => (C1/C2) / X + if (match(Op1, m_FMul(m_Value(X), m_ConstantFP(C2)))) + Fold = ConstantExpr::getFDiv(C1, C2); + else if (match(Op1, m_FDiv(m_Value(X), m_ConstantFP(C2)))) { + // C1 / (X/C2) => (C1*C2) / X + Fold = ConstantExpr::getFMul(C1, C2); + } else if (match(Op1, m_FDiv(m_ConstantFP(C2), m_Value(X)))) { + // C1 / (C2/X) => (C1/C2) * X + Fold = ConstantExpr::getFDiv(C1, C2); + CreateDiv = false; + } + + if (Fold) { + const APFloat &FoldC = cast<ConstantFP>(Fold)->getValueAPF(); + if (FoldC.isNormal() && !FoldC.isDenormal()) { + Instruction *R = CreateDiv ? 
+ BinaryOperator::CreateFDiv(Fold, X) : + BinaryOperator::CreateFMul(X, Fold); + R->setFastMathFlags(I.getFastMathFlags()); + return R; + } + } + return 0; + } + + if (AllowReassociate) { + Value *X, *Y; + Value *NewInst = 0; + Instruction *SimpR = 0; + + if (Op0->hasOneUse() && match(Op0, m_FDiv(m_Value(X), m_Value(Y)))) { + // (X/Y) / Z => X / (Y*Z) + // + if (!isa<ConstantFP>(Y) || !isa<ConstantFP>(Op1)) { + NewInst = Builder->CreateFMul(Y, Op1); + SimpR = BinaryOperator::CreateFDiv(X, NewInst); + } + } else if (Op1->hasOneUse() && match(Op1, m_FDiv(m_Value(X), m_Value(Y)))) { + // Z / (X/Y) => Z*Y / X + // + if (!isa<ConstantFP>(Y) || !isa<ConstantFP>(Op0)) { + NewInst = Builder->CreateFMul(Op0, Y); + SimpR = BinaryOperator::CreateFDiv(NewInst, X); + } + } + + if (NewInst) { + if (Instruction *T = dyn_cast<Instruction>(NewInst)) + T->setDebugLoc(I.getDebugLoc()); + SimpR->setFastMathFlags(I.getFastMathFlags()); + return SimpR; } } diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index dd7ea14..4f71db1 100644 --- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -13,7 +13,9 @@ //===----------------------------------------------------------------------===// #include "InstCombine.h" +#include "llvm/Support/PatternMatch.h" using namespace llvm; +using namespace PatternMatch; /// CheapToScalarize - Return true if the value is cheaper to scalarize than it /// is to leave as a vector operation. isConstant indicates whether we're @@ -92,6 +94,13 @@ static Value *FindScalarElement(Value *V, unsigned EltNo) { return FindScalarElement(SVI->getOperand(1), InEl - LHSWidth); } + // Extract a value from a vector add operation with a constant zero. + Value *Val = 0; Constant *Con = 0; + if (match(V, m_Add(m_Value(Val), m_Constant(Con)))) { + if (Con->getAggregateElement(EltNo)->isNullValue()) + return FindScalarElement(Val, EltNo); + } + // Otherwise, we don't know. return 0; } @@ -295,12 +304,12 @@ static Value *CollectShuffleElements(Value *V, SmallVectorImpl<Constant*> &Mask, Mask.assign(NumElts, UndefValue::get(Type::getInt32Ty(V->getContext()))); return V; } - + if (isa<ConstantAggregateZero>(V)) { Mask.assign(NumElts, ConstantInt::get(Type::getInt32Ty(V->getContext()),0)); return V; } - + if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) { // If this is an insert of an extract from some other vector, include it. Value *VecOp = IEI->getOperand(0); @@ -595,12 +604,12 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { // ShuffleVectorInst is equivalent to the original one. for (unsigned i = 0; i < VWidth; ++i) { int eltMask; - if (Mask[i] == -1) { + if (Mask[i] < 0) { // This element is an undef value. eltMask = -1; } else if (Mask[i] < (int)LHSWidth) { // This element is from left hand side vector operand. - // + // // If LHS is going to be replaced (case 1, 2, or 4), calculate the // new mask value for the element. if (newLHS != LHS) { @@ -609,8 +618,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { // with a -1 mask value. 
if (eltMask >= (int)LHSOp0Width && isa<UndefValue>(LHSOp1)) eltMask = -1; - } - else + } else eltMask = Mask[i]; } else { // This element is from right hand side vector operand @@ -630,8 +638,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { && "should have been check above"); eltMask = -1; } - } - else + } else eltMask = Mask[i]-LHSWidth; // If LHS's width is changed, shift the mask value accordingly. diff --git a/lib/Transforms/InstCombine/InstCombineWorklist.h b/lib/Transforms/InstCombine/InstCombineWorklist.h index 57ed9e3..49efce5 100644 --- a/lib/Transforms/InstCombine/InstCombineWorklist.h +++ b/lib/Transforms/InstCombine/InstCombineWorklist.h @@ -19,20 +19,20 @@ #include "llvm/Support/raw_ostream.h" namespace llvm { - + /// InstCombineWorklist - This is the worklist management logic for /// InstCombine. class LLVM_LIBRARY_VISIBILITY InstCombineWorklist { SmallVector<Instruction*, 256> Worklist; DenseMap<Instruction*, unsigned> WorklistMap; - + void operator=(const InstCombineWorklist&RHS) LLVM_DELETED_FUNCTION; InstCombineWorklist(const InstCombineWorklist&) LLVM_DELETED_FUNCTION; public: InstCombineWorklist() {} - + bool isEmpty() const { return Worklist.empty(); } - + /// Add - Add the specified instruction to the worklist if it isn't already /// in it. void Add(Instruction *I) { @@ -41,12 +41,12 @@ public: Worklist.push_back(I); } } - + void AddValue(Value *V) { if (Instruction *I = dyn_cast<Instruction>(V)) Add(I); } - + /// AddInitialGroup - Add the specified batch of stuff in reverse order. /// which should only be done when the worklist is empty and when the group /// has no duplicates. @@ -61,25 +61,25 @@ public: Worklist.push_back(I); } } - + // Remove - remove I from the worklist if it exists. void Remove(Instruction *I) { DenseMap<Instruction*, unsigned>::iterator It = WorklistMap.find(I); if (It == WorklistMap.end()) return; // Not in worklist. - + // Don't bother moving everything down, just null out the slot. Worklist[It->second] = 0; - + WorklistMap.erase(It); } - + Instruction *RemoveOne() { Instruction *I = Worklist.back(); Worklist.pop_back(); WorklistMap.erase(I); return I; } - + /// AddUsersToWorkList - When an instruction is simplified, add all users of /// the instruction to the work lists because they might get more simplified /// now. @@ -89,18 +89,18 @@ public: UI != UE; ++UI) Add(cast<Instruction>(*UI)); } - - + + /// Zap - check that the worklist is empty and nuke the backing store for /// the map if it is large. void Zap() { assert(WorklistMap.empty() && "Worklist empty, but map not?"); - + // Do an explicit clear, this shrinks the map if needed. WorklistMap.clear(); } }; - + } // end namespace llvm. #endif diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 6f24cdd..c6115e3 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -162,6 +162,21 @@ static bool MaintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) { return !Overflow; } +/// Conservatively clears subclassOptionalData after a reassociation or +/// commutation. We preserve fast-math flags when applicable as they can be +/// preserved. 
+static void ClearSubclassDataAfterReassociation(BinaryOperator &I) { + FPMathOperator *FPMO = dyn_cast<FPMathOperator>(&I); + if (!FPMO) { + I.clearSubclassOptionalData(); + return; + } + + FastMathFlags FMF = I.getFastMathFlags(); + I.clearSubclassOptionalData(); + I.setFastMathFlags(FMF); +} + /// SimplifyAssociativeOrCommutative - This performs a few simplifications for /// operators which are associative or commutative: // @@ -219,7 +234,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { I.clearSubclassOptionalData(); I.setHasNoSignedWrap(true); } else { - I.clearSubclassOptionalData(); + ClearSubclassDataAfterReassociation(I); } Changed = true; @@ -241,7 +256,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { I.setOperand(1, C); // Conservatively clear the optional flags, since they may not be // preserved by the reassociation. - I.clearSubclassOptionalData(); + ClearSubclassDataAfterReassociation(I); Changed = true; ++NumReassoc; continue; @@ -263,7 +278,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { I.setOperand(1, B); // Conservatively clear the optional flags, since they may not be // preserved by the reassociation. - I.clearSubclassOptionalData(); + ClearSubclassDataAfterReassociation(I); Changed = true; ++NumReassoc; continue; @@ -283,7 +298,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { I.setOperand(1, V); // Conservatively clear the optional flags, since they may not be // preserved by the reassociation. - I.clearSubclassOptionalData(); + ClearSubclassDataAfterReassociation(I); Changed = true; ++NumReassoc; continue; @@ -310,7 +325,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { I.setOperand(1, Folded); // Conservatively clear the optional flags, since they may not be // preserved by the reassociation. - I.clearSubclassOptionalData(); + ClearSubclassDataAfterReassociation(I); Changed = true; continue; @@ -516,8 +531,8 @@ Value *InstCombiner::dyn_castNegVal(Value *V) const { // instruction if the LHS is a constant negative zero (which is the 'negate' // form). // -Value *InstCombiner::dyn_castFNegVal(Value *V) const { - if (BinaryOperator::isFNeg(V)) +Value *InstCombiner::dyn_castFNegVal(Value *V, bool IgnoreZeroSign) const { + if (BinaryOperator::isFNeg(V, IgnoreZeroSign)) return BinaryOperator::getFNegArgument(V); // Constants can be considered to be negated values if they can be folded. 
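Why the clearing has to be conservative in the first place, in one numeric example (plain C++, with i8 arithmetic modeled through widening):

#include <cstdint>
#include <cstdio>

// With a = 100, b = 30, c = -30 in i8, (a+b)+c overflows at the inner add
// while a+(b+c) stays in range, so an nsw flag valid for one grouping need
// not be valid for the other. Only fast-math flags survive reassociation,
// via ClearSubclassDataAfterReassociation above.
int main() {
  int8_t A = 100, B = 30, C = -30;
  int AB = (int)A + (int)B; // 130: outside [-128, 127]
  int BC = (int)B + (int)C; // 0: in range
  std::printf("a+b=%d overflows=%d, b+c=%d overflows=%d\n",
              AB, AB > 127 || AB < -128, BC, BC > 127 || BC < -128);
  return 0;
}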
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 9bd3239..6877475 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -16,7 +16,6 @@ #define DEBUG_TYPE "asan" #include "llvm/Transforms/Instrumentation.h" -#include "BlackList.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" @@ -36,6 +35,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/InstVisitor.h" +#include "llvm/Support/CallSite.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/Debug.h" @@ -43,6 +43,7 @@ #include "llvm/Support/system_error.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/BlackList.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include <algorithm> @@ -53,7 +54,8 @@ using namespace llvm; static const uint64_t kDefaultShadowScale = 3; static const uint64_t kDefaultShadowOffset32 = 1ULL << 29; static const uint64_t kDefaultShadowOffset64 = 1ULL << 44; -static const uint64_t kDefaultShadowOffsetAndroid = 0; +static const uint64_t kDefaultShort64bitShadowOffset = 0x7FFF8000; // < 2G. +static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 41; static const size_t kMaxStackMallocSize = 1 << 16; // 64K static const uintptr_t kCurrentStackFrameMagic = 0x41B58AB3; @@ -63,11 +65,13 @@ static const char *kAsanModuleCtorName = "asan.module_ctor"; static const char *kAsanModuleDtorName = "asan.module_dtor"; static const int kAsanCtorAndCtorPriority = 1; static const char *kAsanReportErrorTemplate = "__asan_report_"; +static const char *kAsanReportLoadN = "__asan_report_load_n"; +static const char *kAsanReportStoreN = "__asan_report_store_n"; static const char *kAsanRegisterGlobalsName = "__asan_register_globals"; static const char *kAsanUnregisterGlobalsName = "__asan_unregister_globals"; static const char *kAsanPoisonGlobalsName = "__asan_before_dynamic_init"; static const char *kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init"; -static const char *kAsanInitName = "__asan_init"; +static const char *kAsanInitName = "__asan_init_v1"; static const char *kAsanHandleNoReturnName = "__asan_handle_no_return"; static const char *kAsanMappingOffsetName = "__asan_mapping_offset"; static const char *kAsanMappingScaleName = "__asan_mapping_scale"; @@ -133,6 +137,9 @@ static cl::opt<int> ClMappingScale("asan-mapping-scale", cl::desc("scale of asan shadow mapping"), cl::Hidden, cl::init(0)); static cl::opt<int> ClMappingOffsetLog("asan-mapping-offset-log", cl::desc("offset of asan shadow mapping"), cl::Hidden, cl::init(-1)); +static cl::opt<bool> ClShort64BitOffset("asan-short-64bit-mapping-offset", + cl::desc("Use short immediate constant as the mapping offset for 64bit"), + cl::Hidden, cl::init(true)); // Optimization flags. Not user visible, used mostly for testing // and benchmarking the tool. @@ -186,14 +193,53 @@ class SetOfDynamicallyInitializedGlobals { SmallSet<GlobalValue*, 32> DynInitGlobals; }; -static int MappingScale() { - return ClMappingScale ? ClMappingScale : kDefaultShadowScale; +/// This struct defines the shadow mapping using the rule: +/// shadow = (mem >> Scale) ADD-or-OR Offset. 
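Worked numbers for that rule, using constants this patch introduces (the default Scale of 3 maps 8 application bytes to 1 shadow byte; the address below is just an example):

#include <cstdint>
#include <cstdio>

// shadow = (mem >> Scale) ADD-or-OR Offset, with kDefaultShadowScale = 3
// and kDefaultShort64bitShadowOffset = 0x7FFF8000 from this patch.
int main() {
  const int Scale = 3;
  const uint64_t Offset = 0x7FFF8000ULL;
  uint64_t Mem = 0x602000000010ULL; // arbitrary example address
  std::printf("add form: 0x%llx\n",
              (unsigned long long)((Mem >> Scale) + Offset));
  std::printf("or form:  0x%llx\n",
              (unsigned long long)((Mem >> Scale) | Offset));
  return 0;
}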
+struct ShadowMapping { + int Scale; + uint64_t Offset; + bool OrShadowOffset; +}; + +static ShadowMapping getShadowMapping(const Module &M, int LongSize, + bool ZeroBaseShadow) { + llvm::Triple TargetTriple(M.getTargetTriple()); + bool IsAndroid = TargetTriple.getEnvironment() == llvm::Triple::Android; + bool IsMacOSX = TargetTriple.getOS() == llvm::Triple::MacOSX; + bool IsPPC64 = TargetTriple.getArch() == llvm::Triple::ppc64; + bool IsX86_64 = TargetTriple.getArch() == llvm::Triple::x86_64; + + ShadowMapping Mapping; + + // OR-ing the shadow offset is more efficient (at least on x86), + // but on ppc64 we have to use add, since the shadow offset is not necessarily + // 1/8-th of the address space. + Mapping.OrShadowOffset = !IsPPC64 && !ClShort64BitOffset; + + Mapping.Offset = (IsAndroid || ZeroBaseShadow) ? 0 : + (LongSize == 32 ? kDefaultShadowOffset32 : + IsPPC64 ? kPPC64_ShadowOffset64 : kDefaultShadowOffset64); + if (!ZeroBaseShadow && ClShort64BitOffset && IsX86_64 && !IsMacOSX) { + assert(LongSize == 64); + Mapping.Offset = kDefaultShort64bitShadowOffset; + } + if (!ZeroBaseShadow && ClMappingOffsetLog >= 0) { + // Zero offset log is the special case. + Mapping.Offset = (ClMappingOffsetLog == 0) ? 0 : 1ULL << ClMappingOffsetLog; + } + + Mapping.Scale = kDefaultShadowScale; + if (ClMappingScale) { + Mapping.Scale = ClMappingScale; + } + + return Mapping; } -static size_t RedzoneSize() { +static size_t RedzoneSizeForScale(int MappingScale) { // Redzone used for stack and globals is at least 32 bytes. // For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively. - return std::max(32U, 1U << MappingScale()); + return std::max(32U, 1U << MappingScale); } /// AddressSanitizer: instrument the code in module to find memory bugs. @@ -201,23 +247,27 @@ struct AddressSanitizer : public FunctionPass { AddressSanitizer(bool CheckInitOrder = false, bool CheckUseAfterReturn = false, bool CheckLifetime = false, - StringRef BlacklistFile = StringRef()) + StringRef BlacklistFile = StringRef(), + bool ZeroBaseShadow = false) : FunctionPass(ID), CheckInitOrder(CheckInitOrder || ClInitializers), CheckUseAfterReturn(CheckUseAfterReturn || ClUseAfterReturn), CheckLifetime(CheckLifetime || ClCheckLifetime), BlacklistFile(BlacklistFile.empty() ?
ClBlacklistFile - : BlacklistFile) {} + : BlacklistFile), + ZeroBaseShadow(ZeroBaseShadow) {} virtual const char *getPassName() const { return "AddressSanitizerFunctionPass"; } void instrumentMop(Instruction *I); - void instrumentAddress(Instruction *OrigIns, IRBuilder<> &IRB, - Value *Addr, uint32_t TypeSize, bool IsWrite); + void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore, + Value *Addr, uint32_t TypeSize, bool IsWrite, + Value *SizeArgument); Value *createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong, Value *ShadowValue, uint32_t TypeSize); Instruction *generateCrashCode(Instruction *InsertBefore, Value *Addr, - bool IsWrite, size_t AccessSizeIndex); + bool IsWrite, size_t AccessSizeIndex, + Value *SizeArgument); bool instrumentMemIntrinsic(MemIntrinsic *MI); void instrumentMemIntrinsicParam(Instruction *OrigIns, Value *Addr, Value *Size, @@ -227,6 +277,7 @@ struct AddressSanitizer : public FunctionPass { void createInitializerPoisonCalls(Module &M, Value *FirstAddr, Value *LastAddr); bool maybeInsertAsanInitAtFunctionEntry(Function &F); + void emitShadowMapping(Module &M, IRBuilder<> &IRB) const; virtual bool doInitialization(Module &M); static char ID; // Pass identification, replacement for typeid @@ -240,18 +291,22 @@ struct AddressSanitizer : public FunctionPass { bool CheckInitOrder; bool CheckUseAfterReturn; bool CheckLifetime; + SmallString<64> BlacklistFile; + bool ZeroBaseShadow; + LLVMContext *C; DataLayout *TD; - uint64_t MappingOffset; int LongSize; Type *IntptrTy; + ShadowMapping Mapping; Function *AsanCtorFunction; Function *AsanInitFunction; Function *AsanHandleNoReturnFunc; - SmallString<64> BlacklistFile; OwningPtr<BlackList> BL; // This array is indexed by AccessIsWrite and log2(AccessSize). Function *AsanErrorCallback[2][kNumberOfAccessSizes]; + // This array is indexed by AccessIsWrite. + Function *AsanErrorCallbackSized[2]; InlineAsm *EmptyAsm; SetOfDynamicallyInitializedGlobals DynamicallyInitializedGlobals; @@ -261,11 +316,13 @@ struct AddressSanitizer : public FunctionPass { class AddressSanitizerModule : public ModulePass { public: AddressSanitizerModule(bool CheckInitOrder = false, - StringRef BlacklistFile = StringRef()) + StringRef BlacklistFile = StringRef(), + bool ZeroBaseShadow = false) : ModulePass(ID), CheckInitOrder(CheckInitOrder || ClInitializers), BlacklistFile(BlacklistFile.empty() ? 
ClBlacklistFile - : BlacklistFile) {} + : BlacklistFile), + ZeroBaseShadow(ZeroBaseShadow) {} bool runOnModule(Module &M); static char ID; // Pass identification, replacement for typeid virtual const char *getPassName() const { @@ -278,14 +335,20 @@ class AddressSanitizerModule : public ModulePass { bool ShouldInstrumentGlobal(GlobalVariable *G); void createInitializerPoisonCalls(Module &M, Value *FirstAddr, Value *LastAddr); + size_t RedzoneSize() const { + return RedzoneSizeForScale(Mapping.Scale); + } bool CheckInitOrder; SmallString<64> BlacklistFile; + bool ZeroBaseShadow; + OwningPtr<BlackList> BL; SetOfDynamicallyInitializedGlobals DynamicallyInitializedGlobals; Type *IntptrTy; LLVMContext *C; DataLayout *TD; + ShadowMapping Mapping; Function *AsanPoisonGlobals; Function *AsanUnpoisonGlobals; Function *AsanRegisterGlobals; @@ -308,6 +371,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { LLVMContext *C; Type *IntptrTy; Type *IntptrPtrTy; + ShadowMapping Mapping; SmallVector<AllocaInst*, 16> AllocaVec; SmallVector<Instruction*, 8> RetVec; @@ -332,7 +396,8 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { FunctionStackPoisoner(Function &F, AddressSanitizer &ASan) : F(F), ASan(ASan), DIB(*F.getParent()), C(ASan.C), IntptrTy(ASan.IntptrTy), IntptrPtrTy(PointerType::get(IntptrTy, 0)), - TotalStackSize(0), StackAlignment(1 << MappingScale()) {} + Mapping(ASan.Mapping), + TotalStackSize(0), StackAlignment(1 << Mapping.Scale) {} bool runOnFunction() { if (!ClStack) return false; @@ -411,6 +476,9 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { AI.getAllocatedType()->isSized()); } + size_t RedzoneSize() const { + return RedzoneSizeForScale(Mapping.Scale); + } uint64_t getAllocaSizeInBytes(AllocaInst *AI) { Type *Ty = AI->getAllocatedType(); uint64_t SizeInBytes = ASan.TD->getTypeAllocSize(Ty); @@ -439,9 +507,9 @@ INITIALIZE_PASS(AddressSanitizer, "asan", false, false) FunctionPass *llvm::createAddressSanitizerFunctionPass( bool CheckInitOrder, bool CheckUseAfterReturn, bool CheckLifetime, - StringRef BlacklistFile) { + StringRef BlacklistFile, bool ZeroBaseShadow) { return new AddressSanitizer(CheckInitOrder, CheckUseAfterReturn, - CheckLifetime, BlacklistFile); + CheckLifetime, BlacklistFile, ZeroBaseShadow); } char AddressSanitizerModule::ID = 0; @@ -449,8 +517,9 @@ INITIALIZE_PASS(AddressSanitizerModule, "asan-module", "AddressSanitizer: detects use-after-free and out-of-bounds bugs."
"ModulePass", false, false) ModulePass *llvm::createAddressSanitizerModulePass( - bool CheckInitOrder, StringRef BlacklistFile) { - return new AddressSanitizerModule(CheckInitOrder, BlacklistFile); + bool CheckInitOrder, StringRef BlacklistFile, bool ZeroBaseShadow) { + return new AddressSanitizerModule(CheckInitOrder, BlacklistFile, + ZeroBaseShadow); } static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { @@ -473,32 +542,30 @@ static bool GlobalWasGeneratedByAsan(GlobalVariable *G) { Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) { // Shadow >> scale - Shadow = IRB.CreateLShr(Shadow, MappingScale()); - if (MappingOffset == 0) + Shadow = IRB.CreateLShr(Shadow, Mapping.Scale); + if (Mapping.Offset == 0) return Shadow; // (Shadow >> scale) | offset - return IRB.CreateOr(Shadow, ConstantInt::get(IntptrTy, - MappingOffset)); + if (Mapping.OrShadowOffset) + return IRB.CreateOr(Shadow, ConstantInt::get(IntptrTy, Mapping.Offset)); + else + return IRB.CreateAdd(Shadow, ConstantInt::get(IntptrTy, Mapping.Offset)); } void AddressSanitizer::instrumentMemIntrinsicParam( Instruction *OrigIns, Value *Addr, Value *Size, Instruction *InsertBefore, bool IsWrite) { + IRBuilder<> IRB(InsertBefore); + if (Size->getType() != IntptrTy) + Size = IRB.CreateIntCast(Size, IntptrTy, false); // Check the first byte. - { - IRBuilder<> IRB(InsertBefore); - instrumentAddress(OrigIns, IRB, Addr, 8, IsWrite); - } + instrumentAddress(OrigIns, InsertBefore, Addr, 8, IsWrite, Size); // Check the last byte. - { - IRBuilder<> IRB(InsertBefore); - Value *SizeMinusOne = IRB.CreateSub( - Size, ConstantInt::get(Size->getType(), 1)); - SizeMinusOne = IRB.CreateIntCast(SizeMinusOne, IntptrTy, false); - Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); - Value *AddrPlusSizeMinisOne = IRB.CreateAdd(AddrLong, SizeMinusOne); - instrumentAddress(OrigIns, IRB, AddrPlusSizeMinisOne, 8, IsWrite); - } + IRB.SetInsertPoint(InsertBefore); + Value *SizeMinusOne = IRB.CreateSub(Size, ConstantInt::get(IntptrTy, 1)); + Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); + Value *AddrLast = IRB.CreateAdd(AddrLong, SizeMinusOne); + instrumentAddress(OrigIns, InsertBefore, AddrLast, 8, IsWrite, Size); } // Instrument memset/memmove/memcpy @@ -577,14 +644,24 @@ void AddressSanitizer::instrumentMop(Instruction *I) { assert(OrigTy->isSized()); uint32_t TypeSize = TD->getTypeStoreSizeInBits(OrigTy); - if (TypeSize != 8 && TypeSize != 16 && - TypeSize != 32 && TypeSize != 64 && TypeSize != 128) { - // Ignore all unusual sizes. - return; - } + assert((TypeSize % 8) == 0); + // Instrument a 1-, 2-, 4-, 8-, or 16-byte access with one check. + if (TypeSize == 8 || TypeSize == 16 || + TypeSize == 32 || TypeSize == 64 || TypeSize == 128) + return instrumentAddress(I, I, Addr, TypeSize, IsWrite, 0); + // Instrument unusual size (but still multiple of 8). + // We cannot do it with a single check, so we do a 1-byte check for the first + // and the last bytes. We call __asan_report_*_n(addr, real_size) to be able + // to report the actual access size.
IRBuilder<> IRB(I); - instrumentAddress(I, IRB, Addr, TypeSize, IsWrite); + Value *LastByte = IRB.CreateIntToPtr( + IRB.CreateAdd(IRB.CreatePointerCast(Addr, IntptrTy), + ConstantInt::get(IntptrTy, TypeSize / 8 - 1)), + OrigPtrTy); + Value *Size = ConstantInt::get(IntptrTy, TypeSize / 8); + instrumentAddress(I, I, Addr, 8, IsWrite, Size); + instrumentAddress(I, I, LastByte, 8, IsWrite, Size); } // Validate the result of Module::getOrInsertFunction called for an interface @@ -600,10 +677,12 @@ static Function *checkInterfaceFunction(Constant *FuncOrBitcast) { Instruction *AddressSanitizer::generateCrashCode( Instruction *InsertBefore, Value *Addr, - bool IsWrite, size_t AccessSizeIndex) { + bool IsWrite, size_t AccessSizeIndex, Value *SizeArgument) { IRBuilder<> IRB(InsertBefore); - CallInst *Call = IRB.CreateCall(AsanErrorCallback[IsWrite][AccessSizeIndex], - Addr); + CallInst *Call = SizeArgument + ? IRB.CreateCall2(AsanErrorCallbackSized[IsWrite], Addr, SizeArgument) + : IRB.CreateCall(AsanErrorCallback[IsWrite][AccessSizeIndex], Addr); + // We don't do Call->setDoesNotReturn() because the BB already has // UnreachableInst at the end. // This EmptyAsm is required to avoid callback merge. @@ -614,7 +693,7 @@ Instruction *AddressSanitizer::generateCrashCode( Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong, Value *ShadowValue, uint32_t TypeSize) { - size_t Granularity = 1 << MappingScale(); + size_t Granularity = 1 << Mapping.Scale; // Addr & (Granularity - 1) Value *LastAccessedByte = IRB.CreateAnd( AddrLong, ConstantInt::get(IntptrTy, Granularity - 1)); @@ -630,12 +709,14 @@ Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong, } void AddressSanitizer::instrumentAddress(Instruction *OrigIns, - IRBuilder<> &IRB, Value *Addr, - uint32_t TypeSize, bool IsWrite) { + Instruction *InsertBefore, + Value *Addr, uint32_t TypeSize, + bool IsWrite, Value *SizeArgument) { + IRBuilder<> IRB(InsertBefore); Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); Type *ShadowTy = IntegerType::get( - *C, std::max(8U, TypeSize >> MappingScale())); + *C, std::max(8U, TypeSize >> Mapping.Scale)); Type *ShadowPtrTy = PointerType::get(ShadowTy, 0); Value *ShadowPtr = memToShadow(AddrLong, IRB); Value *CmpVal = Constant::getNullValue(ShadowTy); @@ -644,7 +725,7 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal); size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize); - size_t Granularity = 1 << MappingScale(); + size_t Granularity = 1 << Mapping.Scale; TerminatorInst *CrashTerm = 0; if (ClAlwaysSlowPath || (TypeSize < 8 * Granularity)) { @@ -663,8 +744,8 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, CrashTerm = SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), true); } - Instruction *Crash = - generateCrashCode(CrashTerm, AddrLong, IsWrite, AccessSizeIndex); + Instruction *Crash = generateCrashCode( + CrashTerm, AddrLong, IsWrite, AccessSizeIndex, SizeArgument); Crash->setDebugLoc(OrigIns->getDebugLoc()); } @@ -782,7 +863,9 @@ bool AddressSanitizerModule::runOnModule(Module &M) { BL.reset(new BlackList(BlacklistFile)); if (BL->isIn(M)) return false; C = &(M.getContext()); - IntptrTy = Type::getIntNTy(*C, TD->getPointerSizeInBits()); + int LongSize = TD->getPointerSizeInBits(); + IntptrTy = Type::getIntNTy(*C, LongSize); + Mapping = getShadowMapping(M, LongSize, ZeroBaseShadow); initializeCallbacks(M); DynamicallyInitializedGlobals.Init(M); @@ -819,12 +902,22 @@ bool 
AddressSanitizerModule::runOnModule(Module &M) { Value *FirstDynamic = 0, *LastDynamic = 0; for (size_t i = 0; i < n; i++) { + static const uint64_t kMaxGlobalRedzone = 1 << 18; GlobalVariable *G = GlobalsToChange[i]; PointerType *PtrTy = cast<PointerType>(G->getType()); Type *Ty = PtrTy->getElementType(); uint64_t SizeInBytes = TD->getTypeAllocSize(Ty); - size_t RZ = RedzoneSize(); - uint64_t RightRedzoneSize = RZ + (RZ - (SizeInBytes % RZ)); + uint64_t MinRZ = RedzoneSize(); + // MinRZ <= RZ <= kMaxGlobalRedzone + // and trying to make RZ to be ~ 1/4 of SizeInBytes. + uint64_t RZ = std::max(MinRZ, + std::min(kMaxGlobalRedzone, + (SizeInBytes / MinRZ / 4) * MinRZ)); + uint64_t RightRedzoneSize = RZ; + // Round up to MinRZ + if (SizeInBytes % MinRZ) + RightRedzoneSize += MinRZ - (SizeInBytes % MinRZ); + assert(((RightRedzoneSize + SizeInBytes) % MinRZ) == 0); Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize); // Determine whether this global should be poisoned in initialization. bool GlobalHasDynamicInitializer = @@ -848,7 +941,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) { M, NewTy, G->isConstant(), G->getLinkage(), NewInitializer, "", G, G->getThreadLocalMode()); NewGlobal->copyAttributesFrom(G); - NewGlobal->setAlignment(RZ); + NewGlobal->setAlignment(MinRZ); Value *Indices2[2]; Indices2[0] = IRB.getInt32(0); @@ -921,6 +1014,10 @@ void AddressSanitizer::initializeCallbacks(Module &M) { FunctionName, IRB.getVoidTy(), IntptrTy, NULL)); } } + AsanErrorCallbackSized[0] = checkInterfaceFunction(M.getOrInsertFunction( + kAsanReportLoadN, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); + AsanErrorCallbackSized[1] = checkInterfaceFunction(M.getOrInsertFunction( + kAsanReportStoreN, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); AsanHandleNoReturnFunc = checkInterfaceFunction(M.getOrInsertFunction( kAsanHandleNoReturnName, IRB.getVoidTy(), NULL)); @@ -930,6 +1027,23 @@ void AddressSanitizer::initializeCallbacks(Module &M) { /*hasSideEffects=*/true); } +void AddressSanitizer::emitShadowMapping(Module &M, IRBuilder<> &IRB) const { + // Tell the values of mapping offset and scale to the run-time. + GlobalValue *asan_mapping_offset = + new GlobalVariable(M, IntptrTy, true, GlobalValue::LinkOnceODRLinkage, + ConstantInt::get(IntptrTy, Mapping.Offset), + kAsanMappingOffsetName); + // Read the global, otherwise it may be optimized away. + IRB.CreateLoad(asan_mapping_offset, true); + + GlobalValue *asan_mapping_scale = + new GlobalVariable(M, IntptrTy, true, GlobalValue::LinkOnceODRLinkage, + ConstantInt::get(IntptrTy, Mapping.Scale), + kAsanMappingScaleName); + // Read the global, otherwise it may be optimized away. + IRB.CreateLoad(asan_mapping_scale, true); +} + // virtual bool AddressSanitizer::doInitialization(Module &M) { // Initialize the private fields. No one has accessed them before. @@ -955,41 +1069,10 @@ bool AddressSanitizer::doInitialization(Module &M) { AsanInitFunction->setLinkage(Function::ExternalLinkage); IRB.CreateCall(AsanInitFunction); - llvm::Triple targetTriple(M.getTargetTriple()); - bool isAndroid = targetTriple.getEnvironment() == llvm::Triple::Android; - - MappingOffset = isAndroid ? kDefaultShadowOffsetAndroid : - (LongSize == 32 ? 
kDefaultShadowOffset32 : kDefaultShadowOffset64); - if (ClMappingOffsetLog >= 0) { - if (ClMappingOffsetLog == 0) { - // special case - MappingOffset = 0; - } else { - MappingOffset = 1ULL << ClMappingOffsetLog; - } - } - - - if (ClMappingOffsetLog >= 0) { - // Tell the run-time the current values of mapping offset and scale. - GlobalValue *asan_mapping_offset = - new GlobalVariable(M, IntptrTy, true, GlobalValue::LinkOnceODRLinkage, - ConstantInt::get(IntptrTy, MappingOffset), - kAsanMappingOffsetName); - // Read the global, otherwise it may be optimized away. - IRB.CreateLoad(asan_mapping_offset, true); - } - if (ClMappingScale) { - GlobalValue *asan_mapping_scale = - new GlobalVariable(M, IntptrTy, true, GlobalValue::LinkOnceODRLinkage, - ConstantInt::get(IntptrTy, MappingScale()), - kAsanMappingScaleName); - // Read the global, otherwise it may be optimized away. - IRB.CreateLoad(asan_mapping_scale, true); - } + Mapping = getShadowMapping(M, LongSize, ZeroBaseShadow); + emitShadowMapping(M, IRB); appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndCtorPriority); - return true; } @@ -1015,11 +1098,11 @@ bool AddressSanitizer::runOnFunction(Function &F) { DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n"); initializeCallbacks(*F.getParent()); - // If needed, insert __asan_init before checking for AddressSafety attr. + // If needed, insert __asan_init before checking for SanitizeAddress attr. maybeInsertAsanInitAtFunctionEntry(F); if (!F.getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::AddressSafety)) + Attribute::SanitizeAddress)) return false; if (!ClDebugFunc.empty() && ClDebugFunc != F.getName()) @@ -1048,12 +1131,12 @@ bool AddressSanitizer::runOnFunction(Function &F) { } else if (isa<MemIntrinsic>(BI) && ClMemIntrin) { // ok, take it. } else { - if (CallInst *CI = dyn_cast<CallInst>(BI)) { + CallSite CS(BI); + if (CS) { // A call inside BB. TempsToInstrument.clear(); - if (CI->doesNotReturn()) { - NoReturnCalls.push_back(CI); - } + if (CS.doesNotReturn()) + NoReturnCalls.push_back(CS.getInstruction()); } continue; } @@ -1147,7 +1230,7 @@ void FunctionStackPoisoner::initializeCallbacks(Module &M) { void FunctionStackPoisoner::poisonRedZones( const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB, Value *ShadowBase, bool DoPoison) { - size_t ShadowRZSize = RedzoneSize() >> MappingScale(); + size_t ShadowRZSize = RedzoneSize() >> Mapping.Scale; assert(ShadowRZSize >= 1 && ShadowRZSize <= 4); Type *RZTy = Type::getIntNTy(*C, ShadowRZSize * 8); Type *RZPtrTy = PointerType::get(RZTy, 0); @@ -1178,13 +1261,13 @@ void FunctionStackPoisoner::poisonRedZones( // Poison the partial redzone at right Ptr = IRB.CreateAdd( ShadowBase, ConstantInt::get(IntptrTy, - (Pos >> MappingScale()) - ShadowRZSize)); + (Pos >> Mapping.Scale) - ShadowRZSize)); size_t AddressableBytes = RedzoneSize() - (AlignedSize - SizeInBytes); uint32_t Poison = 0; if (DoPoison) { PoisonShadowPartialRightRedzone((uint8_t*)&Poison, AddressableBytes, RedzoneSize(), - 1ULL << MappingScale(), + 1ULL << Mapping.Scale, kAsanStackPartialRedzoneMagic); } Value *PartialPoison = ConstantInt::get(RZTy, Poison); @@ -1193,7 +1276,7 @@ void FunctionStackPoisoner::poisonRedZones( // Poison the full redzone at right. Ptr = IRB.CreateAdd(ShadowBase, - ConstantInt::get(IntptrTy, Pos >> MappingScale())); + ConstantInt::get(IntptrTy, Pos >> Mapping.Scale)); bool LastAlloca = (i == AllocaVec.size() - 1); Value *Poison = LastAlloca ? 
PoisonRight : PoisonMid; IRB.CreateStore(Poison, IRB.CreateIntToPtr(Ptr, RZPtrTy)); diff --git a/lib/Transforms/Instrumentation/BlackList.cpp b/lib/Transforms/Instrumentation/BlackList.cpp index 4fcbea4..927982d 100644 --- a/lib/Transforms/Instrumentation/BlackList.cpp +++ b/lib/Transforms/Instrumentation/BlackList.cpp @@ -13,7 +13,7 @@ // //===----------------------------------------------------------------------===// -#include "BlackList.h" +#include "llvm/Transforms/Utils/BlackList.h" #include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" @@ -78,21 +78,21 @@ BlackList::BlackList(const StringRef Path) { } // Iterate through each of the prefixes, and create Regexs for them. - for (StringMap<std::string>::iterator I = Regexps.begin(), E = Regexps.end(); - I != E; ++I) { + for (StringMap<std::string>::const_iterator I = Regexps.begin(), + E = Regexps.end(); I != E; ++I) { Entries[I->getKey()] = new Regex(I->getValue()); } } -bool BlackList::isIn(const Function &F) { +bool BlackList::isIn(const Function &F) const { return isIn(*F.getParent()) || inSection("fun", F.getName()); } -bool BlackList::isIn(const GlobalVariable &G) { +bool BlackList::isIn(const GlobalVariable &G) const { return isIn(*G.getParent()) || inSection("global", G.getName()); } -bool BlackList::isIn(const Module &M) { +bool BlackList::isIn(const Module &M) const { return inSection("src", M.getModuleIdentifier()); } @@ -107,14 +107,15 @@ static StringRef GetGVTypeString(const GlobalVariable &G) { return "<unknown type>"; } -bool BlackList::isInInit(const GlobalVariable &G) { +bool BlackList::isInInit(const GlobalVariable &G) const { return (isIn(*G.getParent()) || inSection("global-init", G.getName()) || inSection("global-init-type", GetGVTypeString(G))); } -bool BlackList::inSection(const StringRef Section, const StringRef Query) { - StringMap<Regex*>::iterator I = Entries.find(Section); +bool BlackList::inSection(const StringRef Section, + const StringRef Query) const { + StringMap<Regex*>::const_iterator I = Entries.find(Section); if (I == Entries.end()) return false; Regex *FunctionRegex = I->getValue(); diff --git a/lib/Transforms/Instrumentation/BlackList.h b/lib/Transforms/Instrumentation/BlackList.h deleted file mode 100644 index ee18a98..0000000 --- a/lib/Transforms/Instrumentation/BlackList.h +++ /dev/null @@ -1,58 +0,0 @@ -//===-- BlackList.h - blacklist for sanitizers ------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -//===----------------------------------------------------------------------===// -// -// This is a utility class for instrumentation passes (like AddressSanitizer -// or ThreadSanitizer) to avoid instrumenting some functions or global -// variables based on a user-supplied blacklist. -// -// The blacklist disables instrumentation of various functions and global -// variables. Each line contains a prefix, followed by a wild card expression. -// Empty lines and lines starting with "#" are ignored. -// --- -// # Blacklisted items: -// fun:*_ZN4base6subtle* -// global:*global_with_bad_access_or_initialization* -// global-init:*global_with_initialization_issues* -// global-init-type:*Namespace::ClassName* -// src:file_with_tricky_code.cc -// --- -// Note that the wild card is in fact an llvm::Regex, but * is automatically -// replaced with .* -// This is similar to the "ignore" feature of ThreadSanitizer. 
-// http://code.google.com/p/data-race-test/wiki/ThreadSanitizerIgnores -// -//===----------------------------------------------------------------------===// -// - -#include "llvm/ADT/StringMap.h" - -namespace llvm { -class Function; -class GlobalVariable; -class Module; -class Regex; -class StringRef; - -class BlackList { - public: - BlackList(const StringRef Path); - // Returns whether either this function or it's source file are blacklisted. - bool isIn(const Function &F); - // Returns whether either this global or it's source file are blacklisted. - bool isIn(const GlobalVariable &G); - // Returns whether this module is blacklisted by filename. - bool isIn(const Module &M); - // Returns whether a global should be excluded from initialization checking. - bool isInInit(const GlobalVariable &G); - private: - StringMap<Regex*> Entries; - - bool inSection(const StringRef Section, const StringRef Query); -}; - -} // namespace llvm diff --git a/lib/Transforms/Instrumentation/EdgeProfiling.cpp b/lib/Transforms/Instrumentation/EdgeProfiling.cpp index 0b18b4c..a2459fb 100644 --- a/lib/Transforms/Instrumentation/EdgeProfiling.cpp +++ b/lib/Transforms/Instrumentation/EdgeProfiling.cpp @@ -21,7 +21,6 @@ #include "llvm/Transforms/Instrumentation.h" #include "ProfilingUtils.h" #include "llvm/ADT/Statistic.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/raw_ostream.h" @@ -55,8 +54,8 @@ ModulePass *llvm::createEdgeProfilerPass() { return new EdgeProfiler(); } bool EdgeProfiler::runOnModule(Module &M) { Function *Main = M.getFunction("main"); if (Main == 0) { - M.getContext().emitWarning("cannot insert edge profiling into a module" - " with no main function"); + errs() << "WARNING: cannot insert edge profiling into a module" + << " with no main function!\n"; return false; // No main, no instrumentation! 
} diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 58d5801..80705af 100644 --- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -71,7 +71,6 @@ #define DEBUG_TYPE "msan" #include "llvm/Transforms/Instrumentation.h" -#include "BlackList.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" @@ -91,6 +90,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/BlackList.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -127,6 +127,10 @@ static cl::opt<bool> ClHandleICmp("msan-handle-icmp", cl::desc("propagate shadow through ICmpEQ and ICmpNE"), cl::Hidden, cl::init(true)); +static cl::opt<bool> ClHandleICmpExact("msan-handle-icmp-exact", + cl::desc("exact handling of relational integer ICmp"), + cl::Hidden, cl::init(false)); + static cl::opt<bool> ClStoreCleanOrigin("msan-store-clean-origin", cl::desc("store origin for clean (fully initialized) values"), cl::Hidden, cl::init(false)); @@ -361,6 +365,9 @@ bool MemorySanitizer::doInitialization(Module &M) { new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage, IRB.getInt32(TrackOrigins), "__msan_track_origins"); + new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage, + IRB.getInt32(ClKeepGoing), "__msan_keep_going"); + return true; } @@ -451,9 +458,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRB.CreateAlignedStore(Shadow, ShadowPtr, I.getAlignment()); DEBUG(dbgs() << " STORE: " << *NewSI << "\n"); (void)NewSI; - // If the store is volatile, add a check. - if (I.isVolatile()) - insertCheck(Val, &I); + if (ClCheckAccessAddress) insertCheck(Addr, &I); @@ -574,7 +579,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (IntegerType *IT = dyn_cast<IntegerType>(OrigTy)) return IT; if (VectorType *VT = dyn_cast<VectorType>(OrigTy)) { - uint32_t EltSize = MS.TD->getTypeStoreSizeInBits(VT->getElementType()); + uint32_t EltSize = MS.TD->getTypeSizeInBits(VT->getElementType()); return VectorType::get(IntegerType::get(*MS.C, EltSize), VT->getNumElements()); } @@ -586,7 +591,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { DEBUG(dbgs() << "getShadowTy: " << *ST << " ===> " << *Res << "\n"); return Res; } - uint32_t TypeSize = MS.TD->getTypeStoreSizeInBits(OrigTy); + uint32_t TypeSize = MS.TD->getTypeSizeInBits(OrigTy); return IntegerType::get(*MS.C, TypeSize); } @@ -847,7 +852,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// /// Stores the corresponding shadow and (optionally) origin. /// Optionally, checks that the store address is fully defined. - /// Volatile stores check that the value being stored is fully defined. void visitStoreInst(StoreInst &I) { StoreList.push_back(&I); } @@ -1127,10 +1131,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *B = I.getOperand(1); Value *Sa = getShadow(A); Value *Sb = getShadow(B); - if (A->getType()->isPointerTy()) - A = IRB.CreatePointerCast(A, MS.IntptrTy); - if (B->getType()->isPointerTy()) - B = IRB.CreatePointerCast(B, MS.IntptrTy); + + // Get rid of pointers and vectors of pointers. 
+ // For ints (and vectors of ints), types of A and Sa match, + // and this is a no-op. + A = IRB.CreatePointerCast(A, Sa->getType()); + B = IRB.CreatePointerCast(B, Sb->getType()); + // A == B <==> (C = A^B) == 0 // A != B <==> (C = A^B) != 0 // Sc = Sa | Sb @@ -1152,6 +1159,73 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setOriginForNaryOp(I); } + /// \brief Build the lowest possible value of V, taking into account V's + /// uninitialized bits. + Value *getLowestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa, + bool isSigned) { + if (isSigned) { + // Split shadow into sign bit and other bits. + Value *SaOtherBits = IRB.CreateLShr(IRB.CreateShl(Sa, 1), 1); + Value *SaSignBit = IRB.CreateXor(Sa, SaOtherBits); + // Maximize the undefined shadow bit, minimize other undefined bits. + return + IRB.CreateOr(IRB.CreateAnd(A, IRB.CreateNot(SaOtherBits)), SaSignBit); + } else { + // Minimize undefined bits. + return IRB.CreateAnd(A, IRB.CreateNot(Sa)); + } + } + + /// \brief Build the highest possible value of V, taking into account V's + /// uninitialized bits. + Value *getHighestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa, + bool isSigned) { + if (isSigned) { + // Split shadow into sign bit and other bits. + Value *SaOtherBits = IRB.CreateLShr(IRB.CreateShl(Sa, 1), 1); + Value *SaSignBit = IRB.CreateXor(Sa, SaOtherBits); + // Minimize the undefined shadow bit, maximize other undefined bits. + return + IRB.CreateOr(IRB.CreateAnd(A, IRB.CreateNot(SaSignBit)), SaOtherBits); + } else { + // Maximize undefined bits. + return IRB.CreateOr(A, Sa); + } + } + + /// \brief Instrument relational comparisons. + /// + /// This function does exact shadow propagation for all relational + /// comparisons of integers, pointers and vectors of those. + /// FIXME: output seems suboptimal when one of the operands is a constant + void handleRelationalComparisonExact(ICmpInst &I) { + IRBuilder<> IRB(&I); + Value *A = I.getOperand(0); + Value *B = I.getOperand(1); + Value *Sa = getShadow(A); + Value *Sb = getShadow(B); + + // Get rid of pointers and vectors of pointers. + // For ints (and vectors of ints), types of A and Sa match, + // and this is a no-op. + A = IRB.CreatePointerCast(A, Sa->getType()); + B = IRB.CreatePointerCast(B, Sb->getType()); + + // Let [a0, a1] be the interval of possible values of A, taking into account + // its undefined bits. Let [b0, b1] be the interval of possible values of B. + // Then (A cmp B) is defined iff (a0 cmp b1) == (a1 cmp b0). + bool IsSigned = I.isSigned(); + Value *S1 = IRB.CreateICmp(I.getPredicate(), + getLowestPossibleValue(IRB, A, Sa, IsSigned), + getHighestPossibleValue(IRB, B, Sb, IsSigned)); + Value *S2 = IRB.CreateICmp(I.getPredicate(), + getHighestPossibleValue(IRB, A, Sa, IsSigned), + getLowestPossibleValue(IRB, B, Sb, IsSigned)); + Value *Si = IRB.CreateXor(S1, S2); + setShadow(&I, Si); + setOriginForNaryOp(I); + } + /// \brief Instrument signed relational comparisons.
/// /// Handle (x<0) and (x>=0) comparisons (essentially, sign bit tests) by @@ -1181,12 +1255,32 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { } void visitICmpInst(ICmpInst &I) { - if (ClHandleICmp && I.isEquality()) + if (!ClHandleICmp) { + handleShadowOr(I); + return; + } + if (I.isEquality()) { handleEqualityComparison(I); - else if (ClHandleICmp && I.isSigned() && I.isRelational()) + return; + } + + assert(I.isRelational()); + if (ClHandleICmpExact) { + handleRelationalComparisonExact(I); + return; + } + if (I.isSigned()) { handleSignedRelationalComparison(I); - else - handleShadowOr(I); + return; + } + + assert(I.isUnsigned()); + if ((isa<Constant>(I.getOperand(0)) || isa<Constant>(I.getOperand(1)))) { + handleRelationalComparisonExact(I); + return; + } + + handleShadowOr(I); } void visitFCmpInst(FCmpInst &I) { @@ -1458,8 +1552,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { AttrBuilder B; B.addAttribute(Attribute::ReadOnly) .addAttribute(Attribute::ReadNone); - Func->removeAttribute(AttributeSet::FunctionIndex, - Attribute::get(Func->getContext(), B)); + Func->removeAttributes(AttributeSet::FunctionIndex, + AttributeSet::get(Func->getContext(), + AttributeSet::FunctionIndex, + B)); } } IRBuilder<> IRB(&I); @@ -1498,6 +1594,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (MS.TrackOrigins) IRB.CreateStore(getOrigin(A), getOriginPtrForArgument(A, IRB, ArgOffset)); + (void)Store; assert(Size != 0 && Store != 0); DEBUG(dbgs() << " Param:" << *Store << "\n"); ArgOffset += DataLayout::RoundUpAlignment(Size, 8); @@ -1774,7 +1871,7 @@ struct VarArgAMD64Helper : public VarArgHelper { // Unpoison the whole __va_list_tag. // FIXME: magic ABI constants. IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), - /* size */24, /* alignment */16, false); + /* size */24, /* alignment */8, false); } void visitVACopyInst(VACopyInst &I) { @@ -1785,7 +1882,7 @@ struct VarArgAMD64Helper : public VarArgHelper { // Unpoison the whole __va_list_tag. // FIXME: magic ABI constants. 
IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), - /* size */ 24, /* alignment */ 16, false); + /* size */24, /* alignment */8, false); } void finalizeInstrumentation() { @@ -1850,8 +1947,9 @@ bool MemorySanitizer::runOnFunction(Function &F) { AttrBuilder B; B.addAttribute(Attribute::ReadOnly) .addAttribute(Attribute::ReadNone); - F.removeAttribute(AttributeSet::FunctionIndex, - Attribute::get(F.getContext(), B)); + F.removeAttributes(AttributeSet::FunctionIndex, + AttributeSet::get(F.getContext(), + AttributeSet::FunctionIndex, B)); return Visitor.runOnFunction(); } diff --git a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp index c5a1fe9..b45aef6 100644 --- a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp +++ b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp @@ -22,7 +22,6 @@ #include "llvm/Analysis/ProfileInfo.h" #include "llvm/Analysis/ProfileInfoLoader.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" @@ -76,8 +75,8 @@ inline static void printEdgeCounter(ProfileInfo::Edge e, bool OptimalEdgeProfiler::runOnModule(Module &M) { Function *Main = M.getFunction("main"); if (Main == 0) { - M.getContext().emitWarning("cannot insert edge profiling into a module" - " with no main function"); + errs() << "WARNING: cannot insert edge profiling into a module" + << " with no main function!\n"; return false; // No main, no instrumentation! } diff --git a/lib/Transforms/Instrumentation/PathProfiling.cpp b/lib/Transforms/Instrumentation/PathProfiling.cpp index 358bbeb..7de7326 100644 --- a/lib/Transforms/Instrumentation/PathProfiling.cpp +++ b/lib/Transforms/Instrumentation/PathProfiling.cpp @@ -1345,8 +1345,8 @@ bool PathProfiler::runOnModule(Module &M) { Main = M.getFunction("MAIN__"); if (!Main) { - Context->emitWarning("cannot insert edge profiling into a module" - " with no main function"); + errs() << "WARNING: cannot insert path profiling into a module" + << " with no main function!\n"; return false; } diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 29d2ece..f93c5ab 100644 --- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -22,7 +22,6 @@ #define DEBUG_TYPE "tsan" #include "llvm/Transforms/Instrumentation.h" -#include "BlackList.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" @@ -41,6 +40,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/BlackList.h" #include "llvm/Transforms/Utils/ModuleUtils.h" using namespace llvm; diff --git a/lib/Transforms/LLVMBuild.txt b/lib/Transforms/LLVMBuild.txt index f7bca06..15e9fba 100644 --- a/lib/Transforms/LLVMBuild.txt +++ b/lib/Transforms/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = IPO InstCombine Instrumentation Scalar Utils Vectorize +subdirectories = IPO InstCombine Instrumentation Scalar Utils Vectorize ObjCARC [component_0] type = Group diff --git a/lib/Transforms/Makefile b/lib/Transforms/Makefile index 8b1df92..c390517 100644 --- a/lib/Transforms/Makefile +++ b/lib/Transforms/Makefile @@ -8,7 +8,7 @@ 
##===----------------------------------------------------------------------===## LEVEL = ../.. -PARALLEL_DIRS = Utils Instrumentation Scalar InstCombine IPO Vectorize Hello +PARALLEL_DIRS = Utils Instrumentation Scalar InstCombine IPO Vectorize Hello ObjCARC include $(LEVEL)/Makefile.config diff --git a/lib/Transforms/ObjCARC/CMakeLists.txt b/lib/Transforms/ObjCARC/CMakeLists.txt new file mode 100644 index 0000000..233deb3 --- /dev/null +++ b/lib/Transforms/ObjCARC/CMakeLists.txt @@ -0,0 +1,13 @@ +add_llvm_library(LLVMObjCARCOpts + ObjCARC.cpp + ObjCARCOpts.cpp + ObjCARCExpand.cpp + ObjCARCAPElim.cpp + ObjCARCAliasAnalysis.cpp + ObjCARCUtil.cpp + ObjCARCContract.cpp + DependencyAnalysis.cpp + ProvenanceAnalysis.cpp + ) + +add_dependencies(LLVMObjCARCOpts intrinsics_gen) diff --git a/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/lib/Transforms/ObjCARC/DependencyAnalysis.cpp new file mode 100644 index 0000000..5aada9c --- /dev/null +++ b/lib/Transforms/ObjCARC/DependencyAnalysis.cpp @@ -0,0 +1,261 @@ +//===- DependencyAnalysis.cpp - ObjC ARC Optimization ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file defines special dependency analysis routines used in Objective C +/// ARC Optimizations. +/// +/// WARNING: This file knows about certain library functions. It recognizes them +/// by name, and hardwires knowledge of their semantics. +/// +/// WARNING: This file knows about how certain Objective-C library functions are +/// used. Naive LLVM IR transformations which would otherwise be +/// behavior-preserving may break these assumptions. +/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "objc-arc-dependency" +#include "ObjCARC.h" +#include "DependencyAnalysis.h" +#include "ProvenanceAnalysis.h" +#include "llvm/Support/CFG.h" + +using namespace llvm; +using namespace llvm::objcarc; + +/// Test whether the given instruction can result in a reference count +/// modification (positive or negative) for the pointer's object. +bool +llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr, + ProvenanceAnalysis &PA, + InstructionClass Class) { + switch (Class) { + case IC_Autorelease: + case IC_AutoreleaseRV: + case IC_User: + // These operations never directly modify a reference count. + return false; + default: break; + } + + ImmutableCallSite CS = static_cast<const Value *>(Inst); + assert(CS && "Only calls can alter reference counts!"); + + // See if AliasAnalysis can help us with the call. + AliasAnalysis::ModRefBehavior MRB = PA.getAA()->getModRefBehavior(CS); + if (AliasAnalysis::onlyReadsMemory(MRB)) + return false; + if (AliasAnalysis::onlyAccessesArgPointees(MRB)) { + for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); + I != E; ++I) { + const Value *Op = *I; + if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op)) + return true; + } + return false; + } + + // Assume the worst. + return true; +} + +/// Test whether the given instruction can "use" the given pointer's object in a +/// way that requires the reference count to be positive. 
+bool +llvm::objcarc::CanUse(const Instruction *Inst, const Value *Ptr, + ProvenanceAnalysis &PA, InstructionClass Class) { + // IC_Call operations (as opposed to IC_CallOrUser) never "use" objc pointers. + if (Class == IC_Call) + return false; + + // Consider various instructions which may have pointer arguments which are + // not "uses". + if (const ICmpInst *ICI = dyn_cast<ICmpInst>(Inst)) { + // Comparing a pointer with null, or any other constant, isn't really a use, + // because we don't care what the pointer points to, or about the values + // of any other dynamic reference-counted pointers. + if (!IsPotentialRetainableObjPtr(ICI->getOperand(1), *PA.getAA())) + return false; + } else if (ImmutableCallSite CS = static_cast<const Value *>(Inst)) { + // For calls, just check the arguments (and not the callee operand). + for (ImmutableCallSite::arg_iterator OI = CS.arg_begin(), + OE = CS.arg_end(); OI != OE; ++OI) { + const Value *Op = *OI; + if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op)) + return true; + } + return false; + } else if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + // Special-case stores, because we don't care about the stored value, just + // the store address. + const Value *Op = GetUnderlyingObjCPtr(SI->getPointerOperand()); + // If we can't tell what the underlying object was, assume there is a + // dependence. + return IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Op, Ptr); + } + + // Check each operand for a match. + for (User::const_op_iterator OI = Inst->op_begin(), OE = Inst->op_end(); + OI != OE; ++OI) { + const Value *Op = *OI; + if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op)) + return true; + } + return false; +} + +/// Test if there can be dependencies on Inst through Arg. This function only +/// tests dependencies relevant for removing pairs of calls. +bool +llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst, + const Value *Arg, ProvenanceAnalysis &PA) { + // If we've reached the definition of Arg, stop. + if (Inst == Arg) + return true; + + switch (Flavor) { + case NeedsPositiveRetainCount: { + InstructionClass Class = GetInstructionClass(Inst); + switch (Class) { + case IC_AutoreleasepoolPop: + case IC_AutoreleasepoolPush: + case IC_None: + return false; + default: + return CanUse(Inst, Arg, PA, Class); + } + } + + case AutoreleasePoolBoundary: { + InstructionClass Class = GetInstructionClass(Inst); + switch (Class) { + case IC_AutoreleasepoolPop: + case IC_AutoreleasepoolPush: + // These mark the end and begin of an autorelease pool scope. + return true; + default: + // Nothing else does this. + return false; + } + } + + case CanChangeRetainCount: { + InstructionClass Class = GetInstructionClass(Inst); + switch (Class) { + case IC_AutoreleasepoolPop: + // Conservatively assume this can decrement any count. + return true; + case IC_AutoreleasepoolPush: + case IC_None: + return false; + default: + return CanAlterRefCount(Inst, Arg, PA, Class); + } + } + + case RetainAutoreleaseDep: + switch (GetBasicInstructionClass(Inst)) { + case IC_AutoreleasepoolPop: + case IC_AutoreleasepoolPush: + // Don't merge an objc_autorelease with an objc_retain inside a different + // autoreleasepool scope. + return true; + case IC_Retain: + case IC_RetainRV: + // Check for a retain of the same pointer for merging. + return GetObjCArg(Inst) == Arg; + default: + // Nothing else matters for objc_retainAutorelease formation. 
+ return false; + } + + case RetainAutoreleaseRVDep: { + InstructionClass Class = GetBasicInstructionClass(Inst); + switch (Class) { + case IC_Retain: + case IC_RetainRV: + // Check for a retain of the same pointer for merging. + return GetObjCArg(Inst) == Arg; + default: + // Anything that can autorelease interrupts + // retainAutoreleaseReturnValue formation. + return CanInterruptRV(Class); + } + } + + case RetainRVDep: + return CanInterruptRV(GetBasicInstructionClass(Inst)); + } + + llvm_unreachable("Invalid dependence flavor"); +} + +/// Walk up the CFG from StartPos (which is in StartBB) and find local and +/// non-local dependencies on Arg. +/// +/// TODO: Cache results? +void +llvm::objcarc::FindDependencies(DependenceKind Flavor, + const Value *Arg, + BasicBlock *StartBB, Instruction *StartInst, + SmallPtrSet<Instruction *, 4> &DependingInsts, + SmallPtrSet<const BasicBlock *, 4> &Visited, + ProvenanceAnalysis &PA) { + BasicBlock::iterator StartPos = StartInst; + + SmallVector<std::pair<BasicBlock *, BasicBlock::iterator>, 4> Worklist; + Worklist.push_back(std::make_pair(StartBB, StartPos)); + do { + std::pair<BasicBlock *, BasicBlock::iterator> Pair = + Worklist.pop_back_val(); + BasicBlock *LocalStartBB = Pair.first; + BasicBlock::iterator LocalStartPos = Pair.second; + BasicBlock::iterator StartBBBegin = LocalStartBB->begin(); + for (;;) { + if (LocalStartPos == StartBBBegin) { + pred_iterator PI(LocalStartBB), PE(LocalStartBB, false); + if (PI == PE) + // If we've reached the function entry, produce a null dependence. + DependingInsts.insert(0); + else + // Add the predecessors to the worklist. + do { + BasicBlock *PredBB = *PI; + if (Visited.insert(PredBB)) + Worklist.push_back(std::make_pair(PredBB, PredBB->end())); + } while (++PI != PE); + break; + } + + Instruction *Inst = --LocalStartPos; + if (Depends(Flavor, Inst, Arg, PA)) { + DependingInsts.insert(Inst); + break; + } + } + } while (!Worklist.empty()); + + // Determine whether the original StartBB post-dominates all of the blocks we + // visited. If not, insert a sentinel indicating that most optimizations are + // not safe. + for (SmallPtrSet<const BasicBlock *, 4>::const_iterator I = Visited.begin(), + E = Visited.end(); I != E; ++I) { + const BasicBlock *BB = *I; + if (BB == StartBB) + continue; + const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); + for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) { + const BasicBlock *Succ = *SI; + if (Succ != StartBB && !Visited.count(Succ)) { + DependingInsts.insert(reinterpret_cast<Instruction *>(-1)); + return; + } + } + } +} diff --git a/lib/Transforms/ObjCARC/DependencyAnalysis.h b/lib/Transforms/ObjCARC/DependencyAnalysis.h new file mode 100644 index 0000000..24d358b --- /dev/null +++ b/lib/Transforms/ObjCARC/DependencyAnalysis.h @@ -0,0 +1,79 @@ +//===- DependencyAnalysis.h - ObjC ARC Optimization ---*- mode: c++ -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file declares special dependency analysis routines used in Objective C +/// ARC Optimizations. +/// +/// WARNING: This file knows about certain library functions. It recognizes them +/// by name, and hardwires knowledge of their semantics. +/// +/// WARNING: This file knows about how certain Objective-C library functions are +/// used.
Naive LLVM IR transformations which would otherwise be +/// behavior-preserving may break these assumptions. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_OBJCARC_DEPEDENCYANALYSIS_H +#define LLVM_TRANSFORMS_OBJCARC_DEPEDENCYANALYSIS_H + +#include "llvm/ADT/SmallPtrSet.h" + +namespace llvm { + class BasicBlock; + class Instruction; + class Value; +} + +namespace llvm { +namespace objcarc { + +class ProvenanceAnalysis; + +/// \enum DependenceKind +/// \brief Defines different dependence kinds among various ARC constructs. +/// +/// There are several kinds of dependence-like concepts in use here. +/// +enum DependenceKind { + NeedsPositiveRetainCount, + AutoreleasePoolBoundary, + CanChangeRetainCount, + RetainAutoreleaseDep, ///< Blocks objc_retainAutorelease. + RetainAutoreleaseRVDep, ///< Blocks objc_retainAutoreleaseReturnValue. + RetainRVDep ///< Blocks objc_retainAutoreleasedReturnValue. +}; + +void FindDependencies(DependenceKind Flavor, + const Value *Arg, + BasicBlock *StartBB, Instruction *StartInst, + SmallPtrSet<Instruction *, 4> &DependingInstructions, + SmallPtrSet<const BasicBlock *, 4> &Visited, + ProvenanceAnalysis &PA); + +bool +Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg, + ProvenanceAnalysis &PA); + +/// Test whether the given instruction can "use" the given pointer's object in a +/// way that requires the reference count to be positive. +bool +CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA, + InstructionClass Class); + +/// Test whether the given instruction can result in a reference count +/// modification (positive or negative) for the pointer's object. +bool +CanAlterRefCount(const Instruction *Inst, const Value *Ptr, + ProvenanceAnalysis &PA, InstructionClass Class); + +} // namespace objcarc +} // namespace llvm + +#endif // LLVM_TRANSFORMS_OBJCARC_DEPEDENCYANALYSIS_H diff --git a/lib/Transforms/ObjCARC/LLVMBuild.txt b/lib/Transforms/ObjCARC/LLVMBuild.txt new file mode 100644 index 0000000..90a2338 --- /dev/null +++ b/lib/Transforms/ObjCARC/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Transforms/ObjCARC/LLVMBuild.txt -------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = ObjCARC +parent = Transforms +library_name = ObjCARCOpts +required_libraries = Analysis Core Support TransformUtils diff --git a/lib/Transforms/ObjCARC/Makefile b/lib/Transforms/ObjCARC/Makefile new file mode 100644 index 0000000..2a34e21 --- /dev/null +++ b/lib/Transforms/ObjCARC/Makefile @@ -0,0 +1,15 @@ +##===- lib/Transforms/ObjCARC/Makefile ---------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../.. 
+LIBRARYNAME = LLVMObjCARCOpts +BUILD_ARCHIVE = 1 + +include $(LEVEL)/Makefile.common + diff --git a/lib/Transforms/ObjCARC/ObjCARC.cpp b/lib/Transforms/ObjCARC/ObjCARC.cpp new file mode 100644 index 0000000..53a31b0 --- /dev/null +++ b/lib/Transforms/ObjCARC/ObjCARC.cpp @@ -0,0 +1,48 @@ +//===-- ObjCARC.cpp -------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements common infrastructure for libLLVMObjCARCOpts.a, which +// implements several scalar transformations over the LLVM intermediate +// representation, including the C bindings for that library. +// +//===----------------------------------------------------------------------===// + +#include "ObjCARC.h" +#include "llvm-c/Core.h" +#include "llvm-c/Initialization.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" + +namespace llvm { + class PassRegistry; +} + +using namespace llvm; +using namespace llvm::objcarc; + +/// \brief A handy option to enable/disable all ARC Optimizations. +bool llvm::objcarc::EnableARCOpts; +static cl::opt<bool, true> +EnableARCOptimizations("enable-objc-arc-opts", + cl::location(EnableARCOpts), + cl::init(true)); + +/// initializeObjCARCOptsPasses - Initialize all passes linked into the +/// ObjCARCOpts library. +void llvm::initializeObjCARCOpts(PassRegistry &Registry) { + initializeObjCARCAliasAnalysisPass(Registry); + initializeObjCARCAPElimPass(Registry); + initializeObjCARCExpandPass(Registry); + initializeObjCARCContractPass(Registry); + initializeObjCARCOptPass(Registry); +} + +void LLVMInitializeObjCARCOpts(LLVMPassRegistryRef R) { + initializeObjCARCOpts(*unwrap(R)); +} diff --git a/lib/Transforms/ObjCARC/ObjCARC.h b/lib/Transforms/ObjCARC/ObjCARC.h new file mode 100644 index 0000000..e062b66 --- /dev/null +++ b/lib/Transforms/ObjCARC/ObjCARC.h @@ -0,0 +1,389 @@ +//===- ObjCARC.h - ObjC ARC Optimization --------------*- mode: c++ -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file defines common definitions/declarations used by the ObjC ARC +/// Optimizer. ARC stands for Automatic Reference Counting and is a system for +/// managing reference counts for objects in Objective C. +/// +/// WARNING: This file knows about certain library functions. It recognizes them +/// by name, and hardwires knowledge of their semantics. +/// +/// WARNING: This file knows about how certain Objective-C library functions are +/// used. Naive LLVM IR transformations which would otherwise be +/// behavior-preserving may break these assumptions. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_OBJCARC_H +#define LLVM_TRANSFORMS_SCALAR_OBJCARC_H + +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/InstIterator.h" +#include "llvm/Transforms/ObjCARC.h" +#include "llvm/Transforms/Utils/Local.h" + +namespace llvm { +class raw_ostream; +} + +namespace llvm { +namespace objcarc { + +/// \brief A handy option to enable/disable all ARC Optimizations. +extern bool EnableARCOpts; + +/// \brief Test if the given module looks interesting to run ARC optimization +/// on. +static inline bool ModuleHasARC(const Module &M) { + return + M.getNamedValue("objc_retain") || + M.getNamedValue("objc_release") || + M.getNamedValue("objc_autorelease") || + M.getNamedValue("objc_retainAutoreleasedReturnValue") || + M.getNamedValue("objc_retainBlock") || + M.getNamedValue("objc_autoreleaseReturnValue") || + M.getNamedValue("objc_autoreleasePoolPush") || + M.getNamedValue("objc_loadWeakRetained") || + M.getNamedValue("objc_loadWeak") || + M.getNamedValue("objc_destroyWeak") || + M.getNamedValue("objc_storeWeak") || + M.getNamedValue("objc_initWeak") || + M.getNamedValue("objc_moveWeak") || + M.getNamedValue("objc_copyWeak") || + M.getNamedValue("objc_retainedObject") || + M.getNamedValue("objc_unretainedObject") || + M.getNamedValue("objc_unretainedPointer"); +} + +/// \enum InstructionClass +/// \brief A simple classification for instructions. +enum InstructionClass { + IC_Retain, ///< objc_retain + IC_RetainRV, ///< objc_retainAutoreleasedReturnValue + IC_RetainBlock, ///< objc_retainBlock + IC_Release, ///< objc_release + IC_Autorelease, ///< objc_autorelease + IC_AutoreleaseRV, ///< objc_autoreleaseReturnValue + IC_AutoreleasepoolPush, ///< objc_autoreleasePoolPush + IC_AutoreleasepoolPop, ///< objc_autoreleasePoolPop + IC_NoopCast, ///< objc_retainedObject, etc. + IC_FusedRetainAutorelease, ///< objc_retainAutorelease + IC_FusedRetainAutoreleaseRV, ///< objc_retainAutoreleaseReturnValue + IC_LoadWeakRetained, ///< objc_loadWeakRetained (primitive) + IC_StoreWeak, ///< objc_storeWeak (primitive) + IC_InitWeak, ///< objc_initWeak (derived) + IC_LoadWeak, ///< objc_loadWeak (derived) + IC_MoveWeak, ///< objc_moveWeak (derived) + IC_CopyWeak, ///< objc_copyWeak (derived) + IC_DestroyWeak, ///< objc_destroyWeak (derived) + IC_StoreStrong, ///< objc_storeStrong (derived) + IC_CallOrUser, ///< could call objc_release and/or "use" pointers + IC_Call, ///< could call objc_release + IC_User, ///< could "use" a pointer + IC_None ///< anything else +}; + +raw_ostream &operator<<(raw_ostream &OS, const InstructionClass Class); + +/// \brief Test if the given class is objc_retain or equivalent. +static inline bool IsRetain(InstructionClass Class) { + return Class == IC_Retain || + Class == IC_RetainRV; +} + +/// \brief Test if the given class is objc_autorelease or equivalent. +static inline bool IsAutorelease(InstructionClass Class) { + return Class == IC_Autorelease || + Class == IC_AutoreleaseRV; +} + +/// \brief Test if the given class represents instructions which return their +/// argument verbatim. 
+static inline bool IsForwarding(InstructionClass Class) { + // objc_retainBlock technically doesn't always return its argument + // verbatim, but it doesn't matter for our purposes here. + return Class == IC_Retain || + Class == IC_RetainRV || + Class == IC_Autorelease || + Class == IC_AutoreleaseRV || + Class == IC_RetainBlock || + Class == IC_NoopCast; +} + +/// \brief Test if the given class represents instructions which do nothing if +/// passed a null pointer. +static inline bool IsNoopOnNull(InstructionClass Class) { + return Class == IC_Retain || + Class == IC_RetainRV || + Class == IC_Release || + Class == IC_Autorelease || + Class == IC_AutoreleaseRV || + Class == IC_RetainBlock; +} + +/// \brief Test if the given class represents instructions which are always safe +/// to mark with the "tail" keyword. +static inline bool IsAlwaysTail(InstructionClass Class) { + // IC_RetainBlock may be given a stack argument. + return Class == IC_Retain || + Class == IC_RetainRV || + Class == IC_AutoreleaseRV; +} + +/// \brief Test if the given class represents instructions which are never safe +/// to mark with the "tail" keyword. +static inline bool IsNeverTail(InstructionClass Class) { + /// It is never safe to tail call objc_autorelease since by tail calling + /// objc_autorelease, we also tail call -[NSObject autorelease] which supports + /// fast autoreleasing causing our object to be potentially reclaimed from the + /// autorelease pool which violates the semantics of __autoreleasing types in + /// ARC. + return Class == IC_Autorelease; +} + +/// \brief Test if the given class represents instructions which are always safe +/// to mark with the nounwind attribute. +static inline bool IsNoThrow(InstructionClass Class) { + // objc_retainBlock is not nounwind because it calls user copy constructors + // which could theoretically throw. + return Class == IC_Retain || + Class == IC_RetainRV || + Class == IC_Release || + Class == IC_Autorelease || + Class == IC_AutoreleaseRV || + Class == IC_AutoreleasepoolPush || + Class == IC_AutoreleasepoolPop; +} + +/// Test whether the given instruction can autorelease any pointer or cause an +/// autoreleasepool pop. +static inline bool +CanInterruptRV(InstructionClass Class) { + switch (Class) { + case IC_AutoreleasepoolPop: + case IC_CallOrUser: + case IC_Call: + case IC_Autorelease: + case IC_AutoreleaseRV: + case IC_FusedRetainAutorelease: + case IC_FusedRetainAutoreleaseRV: + return true; + default: + return false; + } +} + +/// \brief Determine if F is one of the special known Functions. If it isn't, +/// return IC_CallOrUser. +InstructionClass GetFunctionClass(const Function *F); + +/// \brief Determine which objc runtime call instruction class V belongs to. +/// +/// This is similar to GetInstructionClass except that it only detects objc +/// runtime calls. This allows it to be faster. +/// +static inline InstructionClass GetBasicInstructionClass(const Value *V) { + if (const CallInst *CI = dyn_cast<CallInst>(V)) { + if (const Function *F = CI->getCalledFunction()) + return GetFunctionClass(F); + // Otherwise, be conservative. + return IC_CallOrUser; + } + + // Otherwise, be conservative. + return isa<InvokeInst>(V) ? IC_CallOrUser : IC_User; +} + +/// \brief Determine what kind of construct V is. +InstructionClass GetInstructionClass(const Value *V); + +/// \brief This is a wrapper around getUnderlyingObject which also knows how to +/// look through objc_retain and objc_autorelease calls, which we know to return +/// their argument verbatim. 
+static inline const Value *GetUnderlyingObjCPtr(const Value *V) {
+  for (;;) {
+    V = GetUnderlyingObject(V);
+    if (!IsForwarding(GetBasicInstructionClass(V)))
+      break;
+    V = cast<CallInst>(V)->getArgOperand(0);
+  }
+
+  return V;
+}
+
+/// \brief This is a wrapper around Value::stripPointerCasts which also knows
+/// how to look through objc_retain and objc_autorelease calls, which we know
+/// to return their argument verbatim.
+static inline const Value *StripPointerCastsAndObjCCalls(const Value *V) {
+  for (;;) {
+    V = V->stripPointerCasts();
+    if (!IsForwarding(GetBasicInstructionClass(V)))
+      break;
+    V = cast<CallInst>(V)->getArgOperand(0);
+  }
+  return V;
+}
+
+/// \brief This is a wrapper around Value::stripPointerCasts which also knows
+/// how to look through objc_retain and objc_autorelease calls, which we know
+/// to return their argument verbatim.
+static inline Value *StripPointerCastsAndObjCCalls(Value *V) {
+  for (;;) {
+    V = V->stripPointerCasts();
+    if (!IsForwarding(GetBasicInstructionClass(V)))
+      break;
+    V = cast<CallInst>(V)->getArgOperand(0);
+  }
+  return V;
+}
+
+/// \brief Assuming the given instruction is one of the special calls such as
+/// objc_retain or objc_release, return the argument value, stripped of no-op
+/// casts and forwarding calls.
+static inline Value *GetObjCArg(Value *Inst) {
+  return StripPointerCastsAndObjCCalls(cast<CallInst>(Inst)->getArgOperand(0));
+}
+
+static inline bool isNullOrUndef(const Value *V) {
+  return isa<ConstantPointerNull>(V) || isa<UndefValue>(V);
+}
+
+static inline bool isNoopInstruction(const Instruction *I) {
+  return isa<BitCastInst>(I) ||
+         (isa<GetElementPtrInst>(I) &&
+          cast<GetElementPtrInst>(I)->hasAllZeroIndices());
+}
+
+/// \brief Erase the given instruction.
+///
+/// Many ObjC calls return their argument verbatim, so if it's such a call and
+/// the return value has users, replace them with the argument value.
+///
+static inline void EraseInstruction(Instruction *CI) {
+  Value *OldArg = cast<CallInst>(CI)->getArgOperand(0);
+
+  bool Unused = CI->use_empty();
+
+  if (!Unused) {
+    // Replace the return value with the argument.
+    assert(IsForwarding(GetBasicInstructionClass(CI)) &&
+           "Can't delete non-forwarding instruction with users!");
+    CI->replaceAllUsesWith(OldArg);
+  }
+
+  CI->eraseFromParent();
+
+  if (Unused)
+    RecursivelyDeleteTriviallyDeadInstructions(OldArg);
+}
+
+/// \brief Test whether the given value is possibly a retainable object
+/// pointer.
+static inline bool IsPotentialRetainableObjPtr(const Value *Op) {
+  // Pointers to static or stack storage are not valid retainable object
+  // pointers.
+  if (isa<Constant>(Op) || isa<AllocaInst>(Op))
+    return false;
+  // Special arguments cannot be a valid retainable object pointer.
+  if (const Argument *Arg = dyn_cast<Argument>(Op))
+    if (Arg->hasByValAttr() ||
+        Arg->hasNestAttr() ||
+        Arg->hasStructRetAttr())
+      return false;
+  // Only consider values with pointer types.
+  //
+  // It seems intuitive to exclude function pointer types as well, since
+  // functions are never retainable object pointers; however, clang
+  // occasionally bitcasts retainable object pointers to function-pointer
+  // type temporarily.
+  PointerType *Ty = dyn_cast<PointerType>(Op->getType());
+  if (!Ty)
+    return false;
+  // Conservatively assume anything else is a potential retainable object
+  // pointer.
+  return true;
+}
+
+static inline bool IsPotentialRetainableObjPtr(const Value *Op,
+                                               AliasAnalysis &AA) {
+  // First make the rudimentary check.
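+  // For example, a pointer loaded from an instance variable passes the
+  // rudimentary syntactic check, but is still rejected below if alias
+  // analysis proves the load was from constant memory.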
+ if (!IsPotentialRetainableObjPtr(Op)) + return false; + + // Objects in constant memory are not reference-counted. + if (AA.pointsToConstantMemory(Op)) + return false; + + // Pointers in constant memory are not pointing to reference-counted objects. + if (const LoadInst *LI = dyn_cast<LoadInst>(Op)) + if (AA.pointsToConstantMemory(LI->getPointerOperand())) + return false; + + // Otherwise assume the worst. + return true; +} + +/// \brief Helper for GetInstructionClass. Determines what kind of construct CS +/// is. +static inline InstructionClass GetCallSiteClass(ImmutableCallSite CS) { + for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); + I != E; ++I) + if (IsPotentialRetainableObjPtr(*I)) + return CS.onlyReadsMemory() ? IC_User : IC_CallOrUser; + + return CS.onlyReadsMemory() ? IC_None : IC_Call; +} + +/// \brief Return true if this value refers to a distinct and identifiable +/// object. +/// +/// This is similar to AliasAnalysis's isIdentifiedObject, except that it uses +/// special knowledge of ObjC conventions. +static inline bool IsObjCIdentifiedObject(const Value *V) { + // Assume that call results and arguments have their own "provenance". + // Constants (including GlobalVariables) and Allocas are never + // reference-counted. + if (isa<CallInst>(V) || isa<InvokeInst>(V) || + isa<Argument>(V) || isa<Constant>(V) || + isa<AllocaInst>(V)) + return true; + + if (const LoadInst *LI = dyn_cast<LoadInst>(V)) { + const Value *Pointer = + StripPointerCastsAndObjCCalls(LI->getPointerOperand()); + if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Pointer)) { + // A constant pointer can't be pointing to an object on the heap. It may + // be reference-counted, but it won't be deleted. + if (GV->isConstant()) + return true; + StringRef Name = GV->getName(); + // These special variables are known to hold values which are not + // reference-counted pointers. + if (Name.startswith("\01L_OBJC_SELECTOR_REFERENCES_") || + Name.startswith("\01L_OBJC_CLASSLIST_REFERENCES_") || + Name.startswith("\01L_OBJC_CLASSLIST_SUP_REFS_$_") || + Name.startswith("\01L_OBJC_METH_VAR_NAME_") || + Name.startswith("\01l_objc_msgSend_fixup_")) + return true; + } + } + + return false; +} + +} // end namespace objcarc +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_SCALAR_OBJCARC_H diff --git a/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp new file mode 100644 index 0000000..00d9864 --- /dev/null +++ b/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp @@ -0,0 +1,175 @@ +//===- ObjCARCAPElim.cpp - ObjC ARC Optimization --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file defines ObjC ARC optimizations. ARC stands for Automatic +/// Reference Counting and is a system for managing reference counts for objects +/// in Objective C. +/// +/// This specific file implements optimizations which remove extraneous +/// autorelease pools. +/// +/// WARNING: This file knows about certain library functions. It recognizes them +/// by name, and hardwires knowledge of their semantics. +/// +/// WARNING: This file knows about how certain Objective-C library functions are +/// used. Naive LLVM IR transformations which would otherwise be +/// behavior-preserving may break these assumptions. 
+/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "objc-arc-ap-elim" +#include "ObjCARC.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/IR/Constants.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace llvm::objcarc; + +namespace { + /// \brief Autorelease pool elimination. + class ObjCARCAPElim : public ModulePass { + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual bool runOnModule(Module &M); + + static bool MayAutorelease(ImmutableCallSite CS, unsigned Depth = 0); + static bool OptimizeBB(BasicBlock *BB); + + public: + static char ID; + ObjCARCAPElim() : ModulePass(ID) { + initializeObjCARCAPElimPass(*PassRegistry::getPassRegistry()); + } + }; +} + +char ObjCARCAPElim::ID = 0; +INITIALIZE_PASS(ObjCARCAPElim, + "objc-arc-apelim", + "ObjC ARC autorelease pool elimination", + false, false) + +Pass *llvm::createObjCARCAPElimPass() { + return new ObjCARCAPElim(); +} + +void ObjCARCAPElim::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); +} + +/// Interprocedurally determine if calls made by the given call site can +/// possibly produce autoreleases. +bool ObjCARCAPElim::MayAutorelease(ImmutableCallSite CS, unsigned Depth) { + if (const Function *Callee = CS.getCalledFunction()) { + if (Callee->isDeclaration() || Callee->mayBeOverridden()) + return true; + for (Function::const_iterator I = Callee->begin(), E = Callee->end(); + I != E; ++I) { + const BasicBlock *BB = I; + for (BasicBlock::const_iterator J = BB->begin(), F = BB->end(); + J != F; ++J) + if (ImmutableCallSite JCS = ImmutableCallSite(J)) + // This recursion depth limit is arbitrary. It's just great + // enough to cover known interesting testcases. + if (Depth < 3 && + !JCS.onlyReadsMemory() && + MayAutorelease(JCS, Depth + 1)) + return true; + } + return false; + } + + return true; +} + +bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) { + bool Changed = false; + + Instruction *Push = 0; + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { + Instruction *Inst = I++; + switch (GetBasicInstructionClass(Inst)) { + case IC_AutoreleasepoolPush: + Push = Inst; + break; + case IC_AutoreleasepoolPop: + // If this pop matches a push and nothing in between can autorelease, + // zap the pair. + if (Push && cast<CallInst>(Inst)->getArgOperand(0) == Push) { + Changed = true; + DEBUG(dbgs() << "ObjCARCAPElim::OptimizeBB: Zapping push pop " + "autorelease pair:\n" + " Pop: " << *Inst << "\n" + << " Push: " << *Push << "\n"); + Inst->eraseFromParent(); + Push->eraseFromParent(); + } + Push = 0; + break; + case IC_CallOrUser: + if (MayAutorelease(ImmutableCallSite(Inst))) + Push = 0; + break; + default: + break; + } + } + + return Changed; +} + +bool ObjCARCAPElim::runOnModule(Module &M) { + if (!EnableARCOpts) + return false; + + // If nothing in the Module uses ARC, don't do anything. + if (!ModuleHasARC(M)) + return false; + + // Find the llvm.global_ctors variable, as the first step in + // identifying the global constructors. In theory, unnecessary autorelease + // pools could occur anywhere, but in practice it's pretty rare. Global + // ctors are a place where autorelease pools get inserted automatically, + // so it's pretty common for them to be unnecessary, and it's pretty + // profitable to eliminate them. 
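+  // In IR, the variable holds (priority, constructor) pairs:
+  //   @llvm.global_ctors = appending global [1 x { i32, void ()* }]
+  //     [{ i32, void ()* } { i32 65535, void ()* @ctor }]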
+ GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors"); + if (!GV) + return false; + + assert(GV->hasDefinitiveInitializer() && + "llvm.global_ctors is uncooperative!"); + + bool Changed = false; + + // Dig the constructor functions out of GV's initializer. + ConstantArray *Init = cast<ConstantArray>(GV->getInitializer()); + for (User::op_iterator OI = Init->op_begin(), OE = Init->op_end(); + OI != OE; ++OI) { + Value *Op = *OI; + // llvm.global_ctors is an array of pairs where the second members + // are constructor functions. + Function *F = dyn_cast<Function>(cast<ConstantStruct>(Op)->getOperand(1)); + // If the user used a constructor function with the wrong signature and + // it got bitcasted or whatever, look the other way. + if (!F) + continue; + // Only look at function definitions. + if (F->isDeclaration()) + continue; + // Only look at functions with one basic block. + if (llvm::next(F->begin()) != F->end()) + continue; + // Ok, a single-block constructor function definition. Try to optimize it. + Changed |= OptimizeBB(F->begin()); + } + + return Changed; +} diff --git a/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp new file mode 100644 index 0000000..46b2de7 --- /dev/null +++ b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp @@ -0,0 +1,162 @@ +//===- ObjCARCAliasAnalysis.cpp - ObjC ARC Optimization -*- mode: c++ -*---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file defines a simple ARC-aware AliasAnalysis using special knowledge +/// of Objective C to enhance other optimization passes which rely on the Alias +/// Analysis infrastructure. +/// +/// WARNING: This file knows about certain library functions. It recognizes them +/// by name, and hardwires knowledge of their semantics. +/// +/// WARNING: This file knows about how certain Objective-C library functions are +/// used. Naive LLVM IR transformations which would otherwise be +/// behavior-preserving may break these assumptions. +/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "objc-arc-aa" +#include "ObjCARC.h" +#include "ObjCARCAliasAnalysis.h" +#include "llvm/IR/Instruction.h" +#include "llvm/InitializePasses.h" +#include "llvm/PassAnalysisSupport.h" +#include "llvm/PassSupport.h" + +namespace llvm { + class Function; + class Value; +} + +using namespace llvm; +using namespace llvm::objcarc; + +// Register this pass... +char ObjCARCAliasAnalysis::ID = 0; +INITIALIZE_AG_PASS(ObjCARCAliasAnalysis, AliasAnalysis, "objc-arc-aa", + "ObjC-ARC-Based Alias Analysis", false, true, false) + +ImmutablePass *llvm::createObjCARCAliasAnalysisPass() { + return new ObjCARCAliasAnalysis(); +} + +void +ObjCARCAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AliasAnalysis::getAnalysisUsage(AU); +} + +AliasAnalysis::AliasResult +ObjCARCAliasAnalysis::alias(const Location &LocA, const Location &LocB) { + if (!EnableARCOpts) + return AliasAnalysis::alias(LocA, LocB); + + // First, strip off no-ops, including ObjC-specific no-ops, and try making a + // precise alias query. 
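+  // For instance, when LocA.Ptr is "%1 = call i8* @objc_retain(i8* %p)",
+  // the stripped query is made on %p itself, which the chained analysis can
+  // often answer even though it knows nothing about %1.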
+ const Value *SA = StripPointerCastsAndObjCCalls(LocA.Ptr); + const Value *SB = StripPointerCastsAndObjCCalls(LocB.Ptr); + AliasResult Result = + AliasAnalysis::alias(Location(SA, LocA.Size, LocA.TBAATag), + Location(SB, LocB.Size, LocB.TBAATag)); + if (Result != MayAlias) + return Result; + + // If that failed, climb to the underlying object, including climbing through + // ObjC-specific no-ops, and try making an imprecise alias query. + const Value *UA = GetUnderlyingObjCPtr(SA); + const Value *UB = GetUnderlyingObjCPtr(SB); + if (UA != SA || UB != SB) { + Result = AliasAnalysis::alias(Location(UA), Location(UB)); + // We can't use MustAlias or PartialAlias results here because + // GetUnderlyingObjCPtr may return an offsetted pointer value. + if (Result == NoAlias) + return NoAlias; + } + + // If that failed, fail. We don't need to chain here, since that's covered + // by the earlier precise query. + return MayAlias; +} + +bool +ObjCARCAliasAnalysis::pointsToConstantMemory(const Location &Loc, + bool OrLocal) { + if (!EnableARCOpts) + return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); + + // First, strip off no-ops, including ObjC-specific no-ops, and try making + // a precise alias query. + const Value *S = StripPointerCastsAndObjCCalls(Loc.Ptr); + if (AliasAnalysis::pointsToConstantMemory(Location(S, Loc.Size, Loc.TBAATag), + OrLocal)) + return true; + + // If that failed, climb to the underlying object, including climbing through + // ObjC-specific no-ops, and try making an imprecise alias query. + const Value *U = GetUnderlyingObjCPtr(S); + if (U != S) + return AliasAnalysis::pointsToConstantMemory(Location(U), OrLocal); + + // If that failed, fail. We don't need to chain here, since that's covered + // by the earlier precise query. + return false; +} + +AliasAnalysis::ModRefBehavior +ObjCARCAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) { + // We have nothing to do. Just chain to the next AliasAnalysis. + return AliasAnalysis::getModRefBehavior(CS); +} + +AliasAnalysis::ModRefBehavior +ObjCARCAliasAnalysis::getModRefBehavior(const Function *F) { + if (!EnableARCOpts) + return AliasAnalysis::getModRefBehavior(F); + + switch (GetFunctionClass(F)) { + case IC_NoopCast: + return DoesNotAccessMemory; + default: + break; + } + + return AliasAnalysis::getModRefBehavior(F); +} + +AliasAnalysis::ModRefResult +ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) { + if (!EnableARCOpts) + return AliasAnalysis::getModRefInfo(CS, Loc); + + switch (GetBasicInstructionClass(CS.getInstruction())) { + case IC_Retain: + case IC_RetainRV: + case IC_Autorelease: + case IC_AutoreleaseRV: + case IC_NoopCast: + case IC_AutoreleasepoolPush: + case IC_FusedRetainAutorelease: + case IC_FusedRetainAutoreleaseRV: + // These functions don't access any memory visible to the compiler. + // Note that this doesn't include objc_retainBlock, because it updates + // pointers when it copies block data. + return NoModRef; + default: + break; + } + + return AliasAnalysis::getModRefInfo(CS, Loc); +} + +AliasAnalysis::ModRefResult +ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS1, + ImmutableCallSite CS2) { + // TODO: Theoretically we could check for dependencies between objc_* calls + // and OnlyAccessesArgumentPointees calls or other well-behaved calls. 
+ return AliasAnalysis::getModRefInfo(CS1, CS2); +} diff --git a/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h new file mode 100644 index 0000000..7abe995 --- /dev/null +++ b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h @@ -0,0 +1,74 @@ +//===- ObjCARCAliasAnalysis.h - ObjC ARC Optimization -*- mode: c++ -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares a simple ARC-aware AliasAnalysis using special knowledge +/// of Objective C to enhance other optimization passes which rely on the Alias +/// Analysis infrastructure. +/// +/// WARNING: This file knows about certain library functions. It recognizes them +/// by name, and hardwires knowledge of their semantics. +/// +/// WARNING: This file knows about how certain Objective-C library functions are +/// used. Naive LLVM IR transformations which would otherwise be +/// behavior-preserving may break these assumptions. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_OBJCARC_OBJCARCALIASANALYSIS_H +#define LLVM_TRANSFORMS_OBJCARC_OBJCARCALIASANALYSIS_H + +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Pass.h" + +namespace llvm { +namespace objcarc { + + /// \brief This is a simple alias analysis implementation that uses knowledge + /// of ARC constructs to answer queries. + /// + /// TODO: This class could be generalized to know about other ObjC-specific + /// tricks. Such as knowing that ivars in the non-fragile ABI are non-aliasing + /// even though their offsets are dynamic. + class ObjCARCAliasAnalysis : public ImmutablePass, + public AliasAnalysis { + public: + static char ID; // Class identification, replacement for typeinfo + ObjCARCAliasAnalysis() : ImmutablePass(ID) { + initializeObjCARCAliasAnalysisPass(*PassRegistry::getPassRegistry()); + } + + private: + virtual void initializePass() { + InitializeAliasAnalysis(this); + } + + /// This method is used when a pass implements an analysis interface through + /// multiple inheritance. If needed, it should override this to adjust the + /// this pointer as needed for the specified pass info. 
+ virtual void *getAdjustedAnalysisPointer(const void *PI) { + if (PI == &AliasAnalysis::ID) + return static_cast<AliasAnalysis *>(this); + return this; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual AliasResult alias(const Location &LocA, const Location &LocB); + virtual bool pointsToConstantMemory(const Location &Loc, bool OrLocal); + virtual ModRefBehavior getModRefBehavior(ImmutableCallSite CS); + virtual ModRefBehavior getModRefBehavior(const Function *F); + virtual ModRefResult getModRefInfo(ImmutableCallSite CS, + const Location &Loc); + virtual ModRefResult getModRefInfo(ImmutableCallSite CS1, + ImmutableCallSite CS2); + }; + +} // namespace objcarc +} // namespace llvm + +#endif // LLVM_TRANSFORMS_OBJCARC_OBJCARCALIASANALYSIS_H diff --git a/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/lib/Transforms/ObjCARC/ObjCARCContract.cpp new file mode 100644 index 0000000..1c13d1c --- /dev/null +++ b/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -0,0 +1,537 @@ +//===- ObjCARCContract.cpp - ObjC ARC Optimization ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file defines late ObjC ARC optimizations. ARC stands for Automatic +/// Reference Counting and is a system for managing reference counts for objects +/// in Objective C. +/// +/// This specific file mainly deals with ``contracting'' multiple lower level +/// operations into singular higher level operations through pattern matching. +/// +/// WARNING: This file knows about certain library functions. It recognizes them +/// by name, and hardwires knowledge of their semantics. +/// +/// WARNING: This file knows about how certain Objective-C library functions are +/// used. Naive LLVM IR transformations which would otherwise be +/// behavior-preserving may break these assumptions. +/// +//===----------------------------------------------------------------------===// + +// TODO: ObjCARCContract could insert PHI nodes when uses aren't +// dominated by single calls. + +#define DEBUG_TYPE "objc-arc-contract" +#include "ObjCARC.h" +#include "DependencyAnalysis.h" +#include "ProvenanceAnalysis.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Operator.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +using namespace llvm::objcarc; + +STATISTIC(NumPeeps, "Number of calls peephole-optimized"); +STATISTIC(NumStoreStrongs, "Number objc_storeStrong calls formed"); + +namespace { + /// \brief Late ARC optimizations + /// + /// These change the IR in a way that makes it difficult to be analyzed by + /// ObjCARCOpt, so it's run late. + class ObjCARCContract : public FunctionPass { + bool Changed; + AliasAnalysis *AA; + DominatorTree *DT; + ProvenanceAnalysis PA; + + /// A flag indicating whether this optimization pass should run. + bool Run; + + /// Declarations for ObjC runtime functions, for use in creating calls to + /// them. These are initialized lazily to avoid cluttering up the Module + /// with unused declarations. + + /// Declaration for objc_storeStrong(). + Constant *StoreStrongCallee; + /// Declaration for objc_retainAutorelease(). + Constant *RetainAutoreleaseCallee; + /// Declaration for objc_retainAutoreleaseReturnValue(). 
+ Constant *RetainAutoreleaseRVCallee; + + /// The inline asm string to insert between calls and RetainRV calls to make + /// the optimization work on targets which need it. + const MDString *RetainRVMarker; + + /// The set of inserted objc_storeStrong calls. If at the end of walking the + /// function we have found no alloca instructions, these calls can be marked + /// "tail". + SmallPtrSet<CallInst *, 8> StoreStrongCalls; + + Constant *getStoreStrongCallee(Module *M); + Constant *getRetainAutoreleaseCallee(Module *M); + Constant *getRetainAutoreleaseRVCallee(Module *M); + + bool ContractAutorelease(Function &F, Instruction *Autorelease, + InstructionClass Class, + SmallPtrSet<Instruction *, 4> + &DependingInstructions, + SmallPtrSet<const BasicBlock *, 4> + &Visited); + + void ContractRelease(Instruction *Release, + inst_iterator &Iter); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual bool doInitialization(Module &M); + virtual bool runOnFunction(Function &F); + + public: + static char ID; + ObjCARCContract() : FunctionPass(ID) { + initializeObjCARCContractPass(*PassRegistry::getPassRegistry()); + } + }; +} + +char ObjCARCContract::ID = 0; +INITIALIZE_PASS_BEGIN(ObjCARCContract, + "objc-arc-contract", "ObjC ARC contraction", false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_END(ObjCARCContract, + "objc-arc-contract", "ObjC ARC contraction", false, false) + +Pass *llvm::createObjCARCContractPass() { + return new ObjCARCContract(); +} + +void ObjCARCContract::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<AliasAnalysis>(); + AU.addRequired<DominatorTree>(); + AU.setPreservesCFG(); +} + +Constant *ObjCARCContract::getStoreStrongCallee(Module *M) { + if (!StoreStrongCallee) { + LLVMContext &C = M->getContext(); + Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + Type *I8XX = PointerType::getUnqual(I8X); + Type *Params[] = { I8XX, I8X }; + + AttributeSet Attr = AttributeSet() + .addAttribute(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind) + .addAttribute(M->getContext(), 1, Attribute::NoCapture); + + StoreStrongCallee = + M->getOrInsertFunction( + "objc_storeStrong", + FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false), + Attr); + } + return StoreStrongCallee; +} + +Constant *ObjCARCContract::getRetainAutoreleaseCallee(Module *M) { + if (!RetainAutoreleaseCallee) { + LLVMContext &C = M->getContext(); + Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + Type *Params[] = { I8X }; + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttributeSet Attribute = + AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); + RetainAutoreleaseCallee = + M->getOrInsertFunction("objc_retainAutorelease", FTy, Attribute); + } + return RetainAutoreleaseCallee; +} + +Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) { + if (!RetainAutoreleaseRVCallee) { + LLVMContext &C = M->getContext(); + Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + Type *Params[] = { I8X }; + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttributeSet Attribute = + AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); + RetainAutoreleaseRVCallee = + M->getOrInsertFunction("objc_retainAutoreleaseReturnValue", FTy, + Attribute); + } + return RetainAutoreleaseRVCallee; +} + +/// Merge an autorelease with a retain into a fused call. 
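+/// For example (illustrative IR):
+///
+///   %1 = call i8* @objc_retain(i8* %p)
+///   ...                              ; nothing here may touch the count
+///   call i8* @objc_autorelease(i8* %p)
+///
+/// is fused by rewriting the retain's callee and erasing the autorelease:
+///
+///   %1 = call i8* @objc_retainAutorelease(i8* %p)
+///
+/// (or objc_retainAutoreleaseReturnValue when the second call was an
+/// autoreleaseRV).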
+bool +ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease, + InstructionClass Class, + SmallPtrSet<Instruction *, 4> + &DependingInstructions, + SmallPtrSet<const BasicBlock *, 4> + &Visited) { + const Value *Arg = GetObjCArg(Autorelease); + + // Check that there are no instructions between the retain and the autorelease + // (such as an autorelease_pop) which may change the count. + CallInst *Retain = 0; + if (Class == IC_AutoreleaseRV) + FindDependencies(RetainAutoreleaseRVDep, Arg, + Autorelease->getParent(), Autorelease, + DependingInstructions, Visited, PA); + else + FindDependencies(RetainAutoreleaseDep, Arg, + Autorelease->getParent(), Autorelease, + DependingInstructions, Visited, PA); + + Visited.clear(); + if (DependingInstructions.size() != 1) { + DependingInstructions.clear(); + return false; + } + + Retain = dyn_cast_or_null<CallInst>(*DependingInstructions.begin()); + DependingInstructions.clear(); + + if (!Retain || + GetBasicInstructionClass(Retain) != IC_Retain || + GetObjCArg(Retain) != Arg) + return false; + + Changed = true; + ++NumPeeps; + + DEBUG(dbgs() << "ObjCARCContract::ContractAutorelease: Fusing " + "retain/autorelease. Erasing: " << *Autorelease << "\n" + " Old Retain: " + << *Retain << "\n"); + + if (Class == IC_AutoreleaseRV) + Retain->setCalledFunction(getRetainAutoreleaseRVCallee(F.getParent())); + else + Retain->setCalledFunction(getRetainAutoreleaseCallee(F.getParent())); + + DEBUG(dbgs() << " New Retain: " + << *Retain << "\n"); + + EraseInstruction(Autorelease); + return true; +} + +/// Attempt to merge an objc_release with a store, load, and objc_retain to form +/// an objc_storeStrong. This can be a little tricky because the instructions +/// don't always appear in order, and there may be unrelated intervening +/// instructions. +void ObjCARCContract::ContractRelease(Instruction *Release, + inst_iterator &Iter) { + LoadInst *Load = dyn_cast<LoadInst>(GetObjCArg(Release)); + if (!Load || !Load->isSimple()) return; + + // For now, require everything to be in one basic block. + BasicBlock *BB = Release->getParent(); + if (Load->getParent() != BB) return; + + // Walk down to find the store and the release, which may be in either order. + BasicBlock::iterator I = Load, End = BB->end(); + ++I; + AliasAnalysis::Location Loc = AA->getLocation(Load); + StoreInst *Store = 0; + bool SawRelease = false; + for (; !Store || !SawRelease; ++I) { + if (I == End) + return; + + Instruction *Inst = I; + if (Inst == Release) { + SawRelease = true; + continue; + } + + InstructionClass Class = GetBasicInstructionClass(Inst); + + // Unrelated retains are harmless. + if (IsRetain(Class)) + continue; + + if (Store) { + // The store is the point where we're going to put the objc_storeStrong, + // so make sure there are no uses after it. + if (CanUse(Inst, Load, PA, Class)) + return; + } else if (AA->getModRefInfo(Inst, Loc) & AliasAnalysis::Mod) { + // We are moving the load down to the store, so check for anything + // else which writes to the memory between the load and the store. + Store = dyn_cast<StoreInst>(Inst); + if (!Store || !Store->isSimple()) return; + if (Store->getPointerOperand() != Loc.Ptr) return; + } + } + + Value *New = StripPointerCastsAndObjCCalls(Store->getValueOperand()); + + // Walk up to find the retain. 
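+  // The pattern assembled so far, in illustrative IR:
+  //   %old = load i8** %p
+  //   ...
+  //   store i8* %new, i8** %p
+  //   call void @objc_release(i8* %old)
+  // A retain of %new above the store completes it, and the whole sequence
+  // collapses into: call void @objc_storeStrong(i8** %p, i8* %new)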
+ I = Store; + BasicBlock::iterator Begin = BB->begin(); + while (I != Begin && GetBasicInstructionClass(I) != IC_Retain) + --I; + Instruction *Retain = I; + if (GetBasicInstructionClass(Retain) != IC_Retain) return; + if (GetObjCArg(Retain) != New) return; + + Changed = true; + ++NumStoreStrongs; + + LLVMContext &C = Release->getContext(); + Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + Type *I8XX = PointerType::getUnqual(I8X); + + Value *Args[] = { Load->getPointerOperand(), New }; + if (Args[0]->getType() != I8XX) + Args[0] = new BitCastInst(Args[0], I8XX, "", Store); + if (Args[1]->getType() != I8X) + Args[1] = new BitCastInst(Args[1], I8X, "", Store); + CallInst *StoreStrong = + CallInst::Create(getStoreStrongCallee(BB->getParent()->getParent()), + Args, "", Store); + StoreStrong->setDoesNotThrow(); + StoreStrong->setDebugLoc(Store->getDebugLoc()); + + // We can't set the tail flag yet, because we haven't yet determined + // whether there are any escaping allocas. Remember this call, so that + // we can set the tail flag once we know it's safe. + StoreStrongCalls.insert(StoreStrong); + + if (&*Iter == Store) ++Iter; + Store->eraseFromParent(); + Release->eraseFromParent(); + EraseInstruction(Retain); + if (Load->use_empty()) + Load->eraseFromParent(); +} + +bool ObjCARCContract::doInitialization(Module &M) { + // If nothing in the Module uses ARC, don't do anything. + Run = ModuleHasARC(M); + if (!Run) + return false; + + // These are initialized lazily. + StoreStrongCallee = 0; + RetainAutoreleaseCallee = 0; + RetainAutoreleaseRVCallee = 0; + + // Initialize RetainRVMarker. + RetainRVMarker = 0; + if (NamedMDNode *NMD = + M.getNamedMetadata("clang.arc.retainAutoreleasedReturnValueMarker")) + if (NMD->getNumOperands() == 1) { + const MDNode *N = NMD->getOperand(0); + if (N->getNumOperands() == 1) + if (const MDString *S = dyn_cast<MDString>(N->getOperand(0))) + RetainRVMarker = S; + } + + return false; +} + +bool ObjCARCContract::runOnFunction(Function &F) { + if (!EnableARCOpts) + return false; + + // If nothing in the Module uses ARC, don't do anything. + if (!Run) + return false; + + Changed = false; + AA = &getAnalysis<AliasAnalysis>(); + DT = &getAnalysis<DominatorTree>(); + + PA.setAA(&getAnalysis<AliasAnalysis>()); + + // Track whether it's ok to mark objc_storeStrong calls with the "tail" + // keyword. Be conservative if the function has variadic arguments. + // It seems that functions which "return twice" are also unsafe for the + // "tail" argument, because they are setjmp, which could need to + // return to an earlier stack state. + bool TailOkForStoreStrongs = !F.isVarArg() && + !F.callsFunctionThatReturnsTwice(); + + // For ObjC library calls which return their argument, replace uses of the + // argument with uses of the call return value, if it dominates the use. This + // reduces register pressure. + SmallPtrSet<Instruction *, 4> DependingInstructions; + SmallPtrSet<const BasicBlock *, 4> Visited; + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { + Instruction *Inst = &*I++; + + DEBUG(dbgs() << "ObjCARCContract: Visiting: " << *Inst << "\n"); + + // Only these library routines return their argument. In particular, + // objc_retainBlock does not necessarily return its argument. 
+ InstructionClass Class = GetBasicInstructionClass(Inst); + switch (Class) { + case IC_Retain: + case IC_FusedRetainAutorelease: + case IC_FusedRetainAutoreleaseRV: + break; + case IC_Autorelease: + case IC_AutoreleaseRV: + if (ContractAutorelease(F, Inst, Class, DependingInstructions, Visited)) + continue; + break; + case IC_RetainRV: { + // If we're compiling for a target which needs a special inline-asm + // marker to do the retainAutoreleasedReturnValue optimization, + // insert it now. + if (!RetainRVMarker) + break; + BasicBlock::iterator BBI = Inst; + BasicBlock *InstParent = Inst->getParent(); + + // Step up to see if the call immediately precedes the RetainRV call. + // If it's an invoke, we have to cross a block boundary. And we have + // to carefully dodge no-op instructions. + do { + if (&*BBI == InstParent->begin()) { + BasicBlock *Pred = InstParent->getSinglePredecessor(); + if (!Pred) + goto decline_rv_optimization; + BBI = Pred->getTerminator(); + break; + } + --BBI; + } while (isNoopInstruction(BBI)); + + if (&*BBI == GetObjCArg(Inst)) { + DEBUG(dbgs() << "ObjCARCContract: Adding inline asm marker for " + "retainAutoreleasedReturnValue optimization.\n"); + Changed = true; + InlineAsm *IA = + InlineAsm::get(FunctionType::get(Type::getVoidTy(Inst->getContext()), + /*isVarArg=*/false), + RetainRVMarker->getString(), + /*Constraints=*/"", /*hasSideEffects=*/true); + CallInst::Create(IA, "", Inst); + } + decline_rv_optimization: + break; + } + case IC_InitWeak: { + // objc_initWeak(p, null) => *p = null + CallInst *CI = cast<CallInst>(Inst); + if (isNullOrUndef(CI->getArgOperand(1))) { + Value *Null = + ConstantPointerNull::get(cast<PointerType>(CI->getType())); + Changed = true; + new StoreInst(Null, CI->getArgOperand(0), CI); + + DEBUG(dbgs() << "OBJCARCContract: Old = " << *CI << "\n" + << " New = " << *Null << "\n"); + + CI->replaceAllUsesWith(Null); + CI->eraseFromParent(); + } + continue; + } + case IC_Release: + ContractRelease(Inst, I); + continue; + case IC_User: + // Be conservative if the function has any alloca instructions. + // Technically we only care about escaping alloca instructions, + // but this is sufficient to handle some interesting cases. + if (isa<AllocaInst>(Inst)) + TailOkForStoreStrongs = false; + continue; + default: + continue; + } + + DEBUG(dbgs() << "ObjCARCContract: Finished List.\n\n"); + + // Don't use GetObjCArg because we don't want to look through bitcasts + // and such; to do the replacement, the argument must have type i8*. + const Value *Arg = cast<CallInst>(Inst)->getArgOperand(0); + for (;;) { + // If we're compiling bugpointed code, don't get in trouble. + if (!isa<Instruction>(Arg) && !isa<Argument>(Arg)) + break; + // Look through the uses of the pointer. + for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end(); + UI != UE; ) { + Use &U = UI.getUse(); + unsigned OperandNo = UI.getOperandNo(); + ++UI; // Increment UI now, because we may unlink its element. + + // If the call's return value dominates a use of the call's argument + // value, rewrite the use to use the return value. We check for + // reachability here because an unreachable call is considered to + // trivially dominate itself, which would lead us to rewriting its + // argument in terms of its return value, which would lead to + // infinite loops in GetObjCArg. 
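+      // For example, given "%2 = call i8* @objc_retain(i8* %x)", a later
+      // use of %x dominated by the call can be rewritten to use %2,
+      // reducing register pressure as noted above.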
+ if (DT->isReachableFromEntry(U) && DT->dominates(Inst, U)) { + Changed = true; + Instruction *Replacement = Inst; + Type *UseTy = U.get()->getType(); + if (PHINode *PHI = dyn_cast<PHINode>(U.getUser())) { + // For PHI nodes, insert the bitcast in the predecessor block. + unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo); + BasicBlock *BB = PHI->getIncomingBlock(ValNo); + if (Replacement->getType() != UseTy) + Replacement = new BitCastInst(Replacement, UseTy, "", + &BB->back()); + // While we're here, rewrite all edges for this PHI, rather + // than just one use at a time, to minimize the number of + // bitcasts we emit. + for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) + if (PHI->getIncomingBlock(i) == BB) { + // Keep the UI iterator valid. + if (&PHI->getOperandUse( + PHINode::getOperandNumForIncomingValue(i)) == + &UI.getUse()) + ++UI; + PHI->setIncomingValue(i, Replacement); + } + } else { + if (Replacement->getType() != UseTy) + Replacement = new BitCastInst(Replacement, UseTy, "", + cast<Instruction>(U.getUser())); + U.set(Replacement); + } + } + } + + // If Arg is a no-op casted pointer, strip one level of casts and iterate. + if (const BitCastInst *BI = dyn_cast<BitCastInst>(Arg)) + Arg = BI->getOperand(0); + else if (isa<GEPOperator>(Arg) && + cast<GEPOperator>(Arg)->hasAllZeroIndices()) + Arg = cast<GEPOperator>(Arg)->getPointerOperand(); + else if (isa<GlobalAlias>(Arg) && + !cast<GlobalAlias>(Arg)->mayBeOverridden()) + Arg = cast<GlobalAlias>(Arg)->getAliasee(); + else + break; + } + } + + // If this function has no escaping allocas or suspicious vararg usage, + // objc_storeStrong calls can be marked with the "tail" keyword. + if (TailOkForStoreStrongs) + for (SmallPtrSet<CallInst *, 8>::iterator I = StoreStrongCalls.begin(), + E = StoreStrongCalls.end(); I != E; ++I) + (*I)->setTailCall(); + StoreStrongCalls.clear(); + + return Changed; +} diff --git a/lib/Transforms/ObjCARC/ObjCARCExpand.cpp b/lib/Transforms/ObjCARC/ObjCARCExpand.cpp new file mode 100644 index 0000000..39bf8f3 --- /dev/null +++ b/lib/Transforms/ObjCARC/ObjCARCExpand.cpp @@ -0,0 +1,128 @@ +//===- ObjCARCExpand.cpp - ObjC ARC Optimization --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file defines ObjC ARC optimizations. ARC stands for Automatic +/// Reference Counting and is a system for managing reference counts for objects +/// in Objective C. +/// +/// This specific file deals with early optimizations which perform certain +/// cleanup operations. +/// +/// WARNING: This file knows about certain library functions. It recognizes them +/// by name, and hardwires knowledge of their semantics. +/// +/// WARNING: This file knows about how certain Objective-C library functions are +/// used. Naive LLVM IR transformations which would otherwise be +/// behavior-preserving may break these assumptions. 
+/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "objc-arc-expand" + +#include "ObjCARC.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/PassAnalysisSupport.h" +#include "llvm/PassRegistry.h" +#include "llvm/PassSupport.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/InstIterator.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + class Module; +} + +using namespace llvm; +using namespace llvm::objcarc; + +namespace { + /// \brief Early ARC transformations. + class ObjCARCExpand : public FunctionPass { + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual bool doInitialization(Module &M); + virtual bool runOnFunction(Function &F); + + /// A flag indicating whether this optimization pass should run. + bool Run; + + public: + static char ID; + ObjCARCExpand() : FunctionPass(ID) { + initializeObjCARCExpandPass(*PassRegistry::getPassRegistry()); + } + }; +} + +char ObjCARCExpand::ID = 0; +INITIALIZE_PASS(ObjCARCExpand, + "objc-arc-expand", "ObjC ARC expansion", false, false) + +Pass *llvm::createObjCARCExpandPass() { + return new ObjCARCExpand(); +} + +void ObjCARCExpand::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); +} + +bool ObjCARCExpand::doInitialization(Module &M) { + Run = ModuleHasARC(M); + return false; +} + +bool ObjCARCExpand::runOnFunction(Function &F) { + if (!EnableARCOpts) + return false; + + // If nothing in the Module uses ARC, don't do anything. + if (!Run) + return false; + + bool Changed = false; + + DEBUG(dbgs() << "ObjCARCExpand: Visiting Function: " << F.getName() << "\n"); + + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) { + Instruction *Inst = &*I; + + DEBUG(dbgs() << "ObjCARCExpand: Visiting: " << *Inst << "\n"); + + switch (GetBasicInstructionClass(Inst)) { + case IC_Retain: + case IC_RetainRV: + case IC_Autorelease: + case IC_AutoreleaseRV: + case IC_FusedRetainAutorelease: + case IC_FusedRetainAutoreleaseRV: { + // These calls return their argument verbatim, as a low-level + // optimization. However, this makes high-level optimizations + // harder. Undo any uses of this optimization that the front-end + // emitted here. We'll redo them in the contract pass. + Changed = true; + Value *Value = cast<CallInst>(Inst)->getArgOperand(0); + DEBUG(dbgs() << "ObjCARCExpand: Old = " << *Inst << "\n" + " New = " << *Value << "\n"); + Inst->replaceAllUsesWith(Value); + break; + } + default: + break; + } + } + + DEBUG(dbgs() << "ObjCARCExpand: Finished List.\n\n"); + + return Changed; +} diff --git a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp new file mode 100644 index 0000000..9c14949 --- /dev/null +++ b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -0,0 +1,2691 @@ +//===- ObjCARCOpts.cpp - ObjC ARC Optimization ----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file defines ObjC ARC optimizations. ARC stands for Automatic +/// Reference Counting and is a system for managing reference counts for objects +/// in Objective C. 
+/// +/// The optimizations performed include elimination of redundant, partially +/// redundant, and inconsequential reference count operations, elimination of +/// redundant weak pointer operations, and numerous minor simplifications. +/// +/// WARNING: This file knows about certain library functions. It recognizes them +/// by name, and hardwires knowledge of their semantics. +/// +/// WARNING: This file knows about how certain Objective-C library functions are +/// used. Naive LLVM IR transformations which would otherwise be +/// behavior-preserving may break these assumptions. +/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "objc-arc-opts" +#include "ObjCARC.h" +#include "DependencyAnalysis.h" +#include "ObjCARCAliasAnalysis.h" +#include "ProvenanceAnalysis.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace llvm::objcarc; + +/// \defgroup MiscUtils Miscellaneous utilities that are not ARC specific. +/// @{ + +namespace { + /// \brief An associative container with fast insertion-order (deterministic) + /// iteration over its elements. Plus the special blot operation. + template<class KeyT, class ValueT> + class MapVector { + /// Map keys to indices in Vector. + typedef DenseMap<KeyT, size_t> MapTy; + MapTy Map; + + typedef std::vector<std::pair<KeyT, ValueT> > VectorTy; + /// Keys and values. + VectorTy Vector; + + public: + typedef typename VectorTy::iterator iterator; + typedef typename VectorTy::const_iterator const_iterator; + iterator begin() { return Vector.begin(); } + iterator end() { return Vector.end(); } + const_iterator begin() const { return Vector.begin(); } + const_iterator end() const { return Vector.end(); } + +#ifdef XDEBUG + ~MapVector() { + assert(Vector.size() >= Map.size()); // May differ due to blotting. 
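+      // The two loops below check both directions of the invariant: every
+      // map entry must index the vector slot holding its key, and every
+      // vector entry must either be blotted (defaulted key) or map back to
+      // its own index.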
+ for (typename MapTy::const_iterator I = Map.begin(), E = Map.end(); + I != E; ++I) { + assert(I->second < Vector.size()); + assert(Vector[I->second].first == I->first); + } + for (typename VectorTy::const_iterator I = Vector.begin(), + E = Vector.end(); I != E; ++I) + assert(!I->first || + (Map.count(I->first) && + Map[I->first] == size_t(I - Vector.begin()))); + } +#endif + + ValueT &operator[](const KeyT &Arg) { + std::pair<typename MapTy::iterator, bool> Pair = + Map.insert(std::make_pair(Arg, size_t(0))); + if (Pair.second) { + size_t Num = Vector.size(); + Pair.first->second = Num; + Vector.push_back(std::make_pair(Arg, ValueT())); + return Vector[Num].second; + } + return Vector[Pair.first->second].second; + } + + std::pair<iterator, bool> + insert(const std::pair<KeyT, ValueT> &InsertPair) { + std::pair<typename MapTy::iterator, bool> Pair = + Map.insert(std::make_pair(InsertPair.first, size_t(0))); + if (Pair.second) { + size_t Num = Vector.size(); + Pair.first->second = Num; + Vector.push_back(InsertPair); + return std::make_pair(Vector.begin() + Num, true); + } + return std::make_pair(Vector.begin() + Pair.first->second, false); + } + + const_iterator find(const KeyT &Key) const { + typename MapTy::const_iterator It = Map.find(Key); + if (It == Map.end()) return Vector.end(); + return Vector.begin() + It->second; + } + + /// This is similar to erase, but instead of removing the element from the + /// vector, it just zeros out the key in the vector. This leaves iterators + /// intact, but clients must be prepared for zeroed-out keys when iterating. + void blot(const KeyT &Key) { + typename MapTy::iterator It = Map.find(Key); + if (It == Map.end()) return; + Vector[It->second].first = KeyT(); + Map.erase(It); + } + + void clear() { + Map.clear(); + Vector.clear(); + } + }; +} + +/// @} +/// +/// \defgroup ARCUtilities Utility declarations/definitions specific to ARC. +/// @{ + +/// \brief This is similar to StripPointerCastsAndObjCCalls but it stops as soon +/// as it finds a value with multiple uses. +static const Value *FindSingleUseIdentifiedObject(const Value *Arg) { + if (Arg->hasOneUse()) { + if (const BitCastInst *BC = dyn_cast<BitCastInst>(Arg)) + return FindSingleUseIdentifiedObject(BC->getOperand(0)); + if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Arg)) + if (GEP->hasAllZeroIndices()) + return FindSingleUseIdentifiedObject(GEP->getPointerOperand()); + if (IsForwarding(GetBasicInstructionClass(Arg))) + return FindSingleUseIdentifiedObject( + cast<CallInst>(Arg)->getArgOperand(0)); + if (!IsObjCIdentifiedObject(Arg)) + return 0; + return Arg; + } + + // If we found an identifiable object but it has multiple uses, but they are + // trivial uses, we can still consider this to be a single-use value. + if (IsObjCIdentifiedObject(Arg)) { + for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end(); + UI != UE; ++UI) { + const User *U = *UI; + if (!U->use_empty() || StripPointerCastsAndObjCCalls(U) != Arg) + return 0; + } + + return Arg; + } + + return 0; +} + +/// \brief Test whether the given retainable object pointer escapes. +/// +/// This differs from regular escape analysis in that a use as an +/// argument to a call is not considered an escape. +/// +static bool DoesRetainableObjPtrEscape(const User *Ptr) { + DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Target: " << *Ptr << "\n"); + + // Walk the def-use chains. + SmallVector<const Value *, 4> Worklist; + Worklist.push_back(Ptr); + // If Ptr has any operands add them as well. 
+  for (User::const_op_iterator I = Ptr->op_begin(), E = Ptr->op_end(); I != E;
+       ++I) {
+    Worklist.push_back(*I);
+  }
+
+  // Ensure we do not visit any value twice.
+  SmallPtrSet<const Value *, 8> VisitedSet;
+
+  do {
+    const Value *V = Worklist.pop_back_val();
+
+    DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Visiting: " << *V << "\n");
+
+    for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end();
+         UI != UE; ++UI) {
+      const User *UUser = *UI;
+
+      DEBUG(dbgs() << "DoesRetainableObjPtrEscape: User: " << *UUser << "\n");
+
+      // Special - Use by a call (callee or argument) is not considered
+      // to be an escape.
+      switch (GetBasicInstructionClass(UUser)) {
+      case IC_StoreWeak:
+      case IC_InitWeak:
+      case IC_StoreStrong:
+      case IC_Autorelease:
+      case IC_AutoreleaseRV: {
+        DEBUG(dbgs() << "DoesRetainableObjPtrEscape: User copies pointer "
+              "arguments. Pointer Escapes!\n");
+        // These special functions make copies of their pointer arguments.
+        return true;
+      }
+      case IC_User:
+      case IC_None:
+        // Use by an instruction which copies the value is an escape if the
+        // result is an escape.
+        if (isa<BitCastInst>(UUser) || isa<GetElementPtrInst>(UUser) ||
+            isa<PHINode>(UUser) || isa<SelectInst>(UUser)) {
+
+          if (VisitedSet.insert(UUser)) {
+            DEBUG(dbgs() << "DoesRetainableObjPtrEscape: User copies value. "
+                  "Ptr escapes if result escapes. Adding to list.\n");
+            Worklist.push_back(UUser);
+          } else {
+            DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Already visited node."
+                  "\n");
+          }
+          continue;
+        }
+        // Use by a load is not an escape.
+        if (isa<LoadInst>(UUser))
+          continue;
+        // Use by a store is not an escape if the use is the address.
+        if (const StoreInst *SI = dyn_cast<StoreInst>(UUser))
+          if (V != SI->getValueOperand())
+            continue;
+        break;
+      default:
+        // Regular calls and other stuff are not considered escapes.
+        continue;
+      }
+      // Otherwise, conservatively assume an escape.
+      DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Assuming ptr escapes.\n");
+      return true;
+    }
+  } while (!Worklist.empty());
+
+  // No escapes found.
+  DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Ptr does not escape.\n");
+  return false;
+}
+
+/// @}
+///
+/// \defgroup ARCOpt ARC Optimization.
+/// @{
+
+// TODO: On code like this:
+//
+// objc_retain(%x)
+// stuff_that_cannot_release()
+// objc_autorelease(%x)
+// stuff_that_cannot_release()
+// objc_retain(%x)
+// stuff_that_cannot_release()
+// objc_autorelease(%x)
+//
+// The second retain and autorelease can be deleted.
+
+// TODO: It should be possible to delete
+// objc_autoreleasePoolPush and objc_autoreleasePoolPop
+// pairs if nothing is actually autoreleased between them. Also, autorelease
+// calls followed by objc_autoreleasePoolPop calls (perhaps in ObjC++ code
+// after inlining) can be turned into plain release calls.
+
+// TODO: Critical-edge splitting. If the optimal insertion point is
+// a critical edge, the current algorithm has to fail, because it doesn't
+// know how to split edges. It should be possible to make the optimizer
+// think in terms of edges, rather than blocks, and then split critical
+// edges on demand.
+
+// TODO: OptimizeSequences could be generalized to be Interprocedural.
+
+// TODO: Recognize that a bunch of other objc runtime calls have
+// non-escaping arguments and non-releasing arguments, and may be
+// non-autoreleasing.
+
+// TODO: Sink autorelease calls as far as possible. Unfortunately we
+// usually can't sink them past other calls, which would be the main
+// case where it would be useful.
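The optimizer below models each tracked pointer with the small Sequence state machine and combines per-path states at CFG joins via MergeSeqs. A minimal usage sketch, assuming the Sequence and MergeSeqs definitions that follow (the chosen values are illustrative):

    // Bottom-up, one incoming path is still at S_Release while the other has
    // already advanced to S_Use. The "choose the side which is further along"
    // rule keeps S_Use for the merged state.
    Sequence Merged = MergeSeqs(S_Use, S_Release, /*TopDown=*/false);
    assert(Merged == S_Use);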
+
+// TODO: The pointer returned from objc_loadWeakRetained is retained.
+
+// TODO: Delete release+retain pairs (rare).
+
+STATISTIC(NumNoops, "Number of no-op objc calls eliminated");
+STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated");
+STATISTIC(NumAutoreleases, "Number of autoreleases converted to releases");
+STATISTIC(NumRets, "Number of return value forwarding "
+                   "retain+autoreleases eliminated");
+STATISTIC(NumRRs, "Number of retain+release paths eliminated");
+STATISTIC(NumPeeps, "Number of calls peephole-optimized");
+
+namespace {
+  /// \enum Sequence
+  ///
+  /// \brief A sequence of states that a pointer may go through in which an
+  /// objc_retain and objc_release are actually needed.
+  enum Sequence {
+    S_None,
+    S_Retain,        ///< objc_retain(x).
+    S_CanRelease,    ///< foo(x) -- x could possibly see a ref count decrement.
+    S_Use,           ///< any use of x.
+    S_Stop,          ///< like S_Release, but code motion is stopped.
+    S_Release,       ///< objc_release(x).
+    S_MovableRelease ///< objc_release(x), !clang.imprecise_release.
+  };
+
+  raw_ostream &operator<<(raw_ostream &OS, const Sequence S)
+    LLVM_ATTRIBUTE_UNUSED;
+  raw_ostream &operator<<(raw_ostream &OS, const Sequence S) {
+    switch (S) {
+    case S_None:
+      return OS << "S_None";
+    case S_Retain:
+      return OS << "S_Retain";
+    case S_CanRelease:
+      return OS << "S_CanRelease";
+    case S_Use:
+      return OS << "S_Use";
+    case S_Release:
+      return OS << "S_Release";
+    case S_MovableRelease:
+      return OS << "S_MovableRelease";
+    case S_Stop:
+      return OS << "S_Stop";
+    }
+    llvm_unreachable("Unknown sequence type.");
+  }
+}
+
+static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) {
+  // The easy cases.
+  if (A == B)
+    return A;
+  if (A == S_None || B == S_None)
+    return S_None;
+
+  if (A > B) std::swap(A, B);
+  if (TopDown) {
+    // Choose the side which is further along in the sequence.
+    if ((A == S_Retain || A == S_CanRelease) &&
+        (B == S_CanRelease || B == S_Use))
+      return B;
+  } else {
+    // Choose the side which is further along in the sequence.
+    if ((A == S_Use || A == S_CanRelease) &&
+        (B == S_Use || B == S_Release || B == S_Stop || B == S_MovableRelease))
+      return A;
+    // If both sides are releases, choose the more conservative one.
+    if (A == S_Stop && (B == S_Release || B == S_MovableRelease))
+      return A;
+    if (A == S_Release && B == S_MovableRelease)
+      return A;
+  }
+
+  return S_None;
+}
+
+namespace {
+  /// \brief Unidirectional information about either a
+  /// retain-decrement-use-release sequence or release-use-decrement-retain
+  /// reverse sequence.
+  struct RRInfo {
+    /// After an objc_retain, the reference count of the referenced
+    /// object is known to be positive. Similarly, before an objc_release, the
+    /// reference count of the referenced object is known to be positive. If
+    /// there are retain-release pairs in code regions where the retain count
+    /// is known to be positive, they can be eliminated, regardless of any side
+    /// effects between them.
+    ///
+    /// Also, a retain+release pair nested within another retain+release
+    /// pair all on the known same pointer value can be eliminated, regardless
+    /// of any intervening side effects.
+    ///
+    /// KnownSafe is true when either of these conditions is satisfied.
+    bool KnownSafe;
+
+    /// True if the Calls are objc_retainBlock calls (as opposed to objc_retain
+    /// calls).
+    bool IsRetainBlock;
+
+    /// True if the objc_release calls are all marked with the "tail" keyword.
+    bool IsTailCallRelease;
+
+    /// If the Calls are objc_release calls and they all have a
+    /// clang.imprecise_release tag, this is the metadata tag.
+    MDNode *ReleaseMetadata;
+
+    /// For a top-down sequence, the set of objc_retains or
+    /// objc_retainBlocks. For bottom-up, the set of objc_releases.
+    SmallPtrSet<Instruction *, 2> Calls;
+
+    /// The set of optimal insert positions for moving calls in the opposite
+    /// sequence.
+    SmallPtrSet<Instruction *, 2> ReverseInsertPts;
+
+    RRInfo() :
+      KnownSafe(false), IsRetainBlock(false),
+      IsTailCallRelease(false),
+      ReleaseMetadata(0) {}
+
+    void clear();
+  };
+}
+
+void RRInfo::clear() {
+  KnownSafe = false;
+  IsRetainBlock = false;
+  IsTailCallRelease = false;
+  ReleaseMetadata = 0;
+  Calls.clear();
+  ReverseInsertPts.clear();
+}
+
+namespace {
+  /// \brief This class summarizes several per-pointer runtime properties which
+  /// are propagated through the flow graph.
+  class PtrState {
+    /// True if the reference count is known to be incremented.
+    bool KnownPositiveRefCount;
+
+    /// True if we've seen an opportunity for partial RR elimination, such as
+    /// pushing calls into a CFG triangle or into one side of a CFG diamond.
+    bool Partial;
+
+    /// The current position in the sequence.
+    Sequence Seq : 8;
+
+  public:
+    /// Unidirectional information about the current sequence.
+    ///
+    /// TODO: Encapsulate this better.
+    RRInfo RRI;
+
+    PtrState() : KnownPositiveRefCount(false), Partial(false),
+                 Seq(S_None) {}
+
+    void SetKnownPositiveRefCount() {
+      KnownPositiveRefCount = true;
+    }
+
+    void ClearRefCount() {
+      KnownPositiveRefCount = false;
+    }
+
+    bool IsKnownIncremented() const {
+      return KnownPositiveRefCount;
+    }
+
+    void SetSeq(Sequence NewSeq) {
+      Seq = NewSeq;
+    }
+
+    Sequence GetSeq() const {
+      return Seq;
+    }
+
+    void ClearSequenceProgress() {
+      ResetSequenceProgress(S_None);
+    }
+
+    void ResetSequenceProgress(Sequence NewSeq) {
+      Seq = NewSeq;
+      Partial = false;
+      RRI.clear();
+    }
+
+    void Merge(const PtrState &Other, bool TopDown);
+  };
+}
+
+void
+PtrState::Merge(const PtrState &Other, bool TopDown) {
+  Seq = MergeSeqs(Seq, Other.Seq, TopDown);
+  KnownPositiveRefCount = KnownPositiveRefCount && Other.KnownPositiveRefCount;
+
+  // We can't merge a plain objc_retain with an objc_retainBlock.
+  if (RRI.IsRetainBlock != Other.RRI.IsRetainBlock)
+    Seq = S_None;
+
+  // If we're not in a sequence (anymore), drop all associated state.
+  if (Seq == S_None) {
+    Partial = false;
+    RRI.clear();
+  } else if (Partial || Other.Partial) {
+    // If we're doing a merge on a path that's previously seen a partial
+    // merge, conservatively drop the sequence, to avoid doing partial
+    // RR elimination. If the branch predicates for the two merges differ,
+    // mixing them is unsafe.
+    ClearSequenceProgress();
+  } else {
+    // Conservatively merge the ReleaseMetadata information.
+    if (RRI.ReleaseMetadata != Other.RRI.ReleaseMetadata)
+      RRI.ReleaseMetadata = 0;
+
+    RRI.KnownSafe = RRI.KnownSafe && Other.RRI.KnownSafe;
+    RRI.IsTailCallRelease = RRI.IsTailCallRelease &&
+                            Other.RRI.IsTailCallRelease;
+    RRI.Calls.insert(Other.RRI.Calls.begin(), Other.RRI.Calls.end());
+
+    // Merge the insert point sets. If there are any differences,
+    // that makes this a partial merge.
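+    // (A size mismatch, or any insert point below that was not already
+    // present, means the two sets differ and the merge is partial.)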
+ Partial = RRI.ReverseInsertPts.size() != Other.RRI.ReverseInsertPts.size(); + for (SmallPtrSet<Instruction *, 2>::const_iterator + I = Other.RRI.ReverseInsertPts.begin(), + E = Other.RRI.ReverseInsertPts.end(); I != E; ++I) + Partial |= RRI.ReverseInsertPts.insert(*I); + } +} + +namespace { + /// \brief Per-BasicBlock state. + class BBState { + /// The number of unique control paths from the entry which can reach this + /// block. + unsigned TopDownPathCount; + + /// The number of unique control paths to exits from this block. + unsigned BottomUpPathCount; + + /// A type for PerPtrTopDown and PerPtrBottomUp. + typedef MapVector<const Value *, PtrState> MapTy; + + /// The top-down traversal uses this to record information known about a + /// pointer at the bottom of each block. + MapTy PerPtrTopDown; + + /// The bottom-up traversal uses this to record information known about a + /// pointer at the top of each block. + MapTy PerPtrBottomUp; + + /// Effective predecessors of the current block ignoring ignorable edges and + /// ignored backedges. + SmallVector<BasicBlock *, 2> Preds; + /// Effective successors of the current block ignoring ignorable edges and + /// ignored backedges. + SmallVector<BasicBlock *, 2> Succs; + + public: + BBState() : TopDownPathCount(0), BottomUpPathCount(0) {} + + typedef MapTy::iterator ptr_iterator; + typedef MapTy::const_iterator ptr_const_iterator; + + ptr_iterator top_down_ptr_begin() { return PerPtrTopDown.begin(); } + ptr_iterator top_down_ptr_end() { return PerPtrTopDown.end(); } + ptr_const_iterator top_down_ptr_begin() const { + return PerPtrTopDown.begin(); + } + ptr_const_iterator top_down_ptr_end() const { + return PerPtrTopDown.end(); + } + + ptr_iterator bottom_up_ptr_begin() { return PerPtrBottomUp.begin(); } + ptr_iterator bottom_up_ptr_end() { return PerPtrBottomUp.end(); } + ptr_const_iterator bottom_up_ptr_begin() const { + return PerPtrBottomUp.begin(); + } + ptr_const_iterator bottom_up_ptr_end() const { + return PerPtrBottomUp.end(); + } + + /// Mark this block as being an entry block, which has one path from the + /// entry by definition. + void SetAsEntry() { TopDownPathCount = 1; } + + /// Mark this block as being an exit block, which has one path to an exit by + /// definition. + void SetAsExit() { BottomUpPathCount = 1; } + + PtrState &getPtrTopDownState(const Value *Arg) { + return PerPtrTopDown[Arg]; + } + + PtrState &getPtrBottomUpState(const Value *Arg) { + return PerPtrBottomUp[Arg]; + } + + void clearBottomUpPointers() { + PerPtrBottomUp.clear(); + } + + void clearTopDownPointers() { + PerPtrTopDown.clear(); + } + + void InitFromPred(const BBState &Other); + void InitFromSucc(const BBState &Other); + void MergePred(const BBState &Other); + void MergeSucc(const BBState &Other); + + /// Return the number of possible unique paths from an entry to an exit + /// which pass through this block. This is only valid after both the + /// top-down and bottom-up traversals are complete. + unsigned GetAllPathCount() const { + assert(TopDownPathCount != 0); + assert(BottomUpPathCount != 0); + return TopDownPathCount * BottomUpPathCount; + } + + // Specialized CFG utilities. 
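+    // These iterate over the precomputed Preds/Succs lists above rather than
+    // the raw LLVM CFG, so ignored edges are already filtered out.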
+ typedef SmallVectorImpl<BasicBlock *>::const_iterator edge_iterator; + edge_iterator pred_begin() { return Preds.begin(); } + edge_iterator pred_end() { return Preds.end(); } + edge_iterator succ_begin() { return Succs.begin(); } + edge_iterator succ_end() { return Succs.end(); } + + void addSucc(BasicBlock *Succ) { Succs.push_back(Succ); } + void addPred(BasicBlock *Pred) { Preds.push_back(Pred); } + + bool isExit() const { return Succs.empty(); } + }; +} + +void BBState::InitFromPred(const BBState &Other) { + PerPtrTopDown = Other.PerPtrTopDown; + TopDownPathCount = Other.TopDownPathCount; +} + +void BBState::InitFromSucc(const BBState &Other) { + PerPtrBottomUp = Other.PerPtrBottomUp; + BottomUpPathCount = Other.BottomUpPathCount; +} + +/// The top-down traversal uses this to merge information about predecessors to +/// form the initial state for a new block. +void BBState::MergePred(const BBState &Other) { + // Other.TopDownPathCount can be 0, in which case it is either dead or a + // loop backedge. Loop backedges are special. + TopDownPathCount += Other.TopDownPathCount; + + // Check for overflow. If we have overflow, fall back to conservative + // behavior. + if (TopDownPathCount < Other.TopDownPathCount) { + clearTopDownPointers(); + return; + } + + // For each entry in the other set, if our set has an entry with the same key, + // merge the entries. Otherwise, copy the entry and merge it with an empty + // entry. + for (ptr_const_iterator MI = Other.top_down_ptr_begin(), + ME = Other.top_down_ptr_end(); MI != ME; ++MI) { + std::pair<ptr_iterator, bool> Pair = PerPtrTopDown.insert(*MI); + Pair.first->second.Merge(Pair.second ? PtrState() : MI->second, + /*TopDown=*/true); + } + + // For each entry in our set, if the other set doesn't have an entry with the + // same key, force it to merge with an empty entry. + for (ptr_iterator MI = top_down_ptr_begin(), + ME = top_down_ptr_end(); MI != ME; ++MI) + if (Other.PerPtrTopDown.find(MI->first) == Other.PerPtrTopDown.end()) + MI->second.Merge(PtrState(), /*TopDown=*/true); +} + +/// The bottom-up traversal uses this to merge information about successors to +/// form the initial state for a new block. +void BBState::MergeSucc(const BBState &Other) { + // Other.BottomUpPathCount can be 0, in which case it is either dead or a + // loop backedge. Loop backedges are special. + BottomUpPathCount += Other.BottomUpPathCount; + + // Check for overflow. If we have overflow, fall back to conservative + // behavior. + if (BottomUpPathCount < Other.BottomUpPathCount) { + clearBottomUpPointers(); + return; + } + + // For each entry in the other set, if our set has an entry with the + // same key, merge the entries. Otherwise, copy the entry and merge + // it with an empty entry. + for (ptr_const_iterator MI = Other.bottom_up_ptr_begin(), + ME = Other.bottom_up_ptr_end(); MI != ME; ++MI) { + std::pair<ptr_iterator, bool> Pair = PerPtrBottomUp.insert(*MI); + Pair.first->second.Merge(Pair.second ? PtrState() : MI->second, + /*TopDown=*/false); + } + + // For each entry in our set, if the other set doesn't have an entry + // with the same key, force it to merge with an empty entry. + for (ptr_iterator MI = bottom_up_ptr_begin(), + ME = bottom_up_ptr_end(); MI != ME; ++MI) + if (Other.PerPtrBottomUp.find(MI->first) == Other.PerPtrBottomUp.end()) + MI->second.Merge(PtrState(), /*TopDown=*/false); +} + +namespace { + /// \brief The main ARC optimization pass. 
+  class ObjCARCOpt : public FunctionPass {
+    bool Changed;
+    ProvenanceAnalysis PA;
+
+    /// A flag indicating whether this optimization pass should run.
+    bool Run;
+
+    /// Declarations for ObjC runtime functions, for use in creating calls to
+    /// them. These are initialized lazily to avoid cluttering up the Module
+    /// with unused declarations.
+
+    /// Declaration for ObjC runtime function
+    /// objc_retainAutoreleasedReturnValue.
+    Constant *RetainRVCallee;
+    /// Declaration for ObjC runtime function objc_autoreleaseReturnValue.
+    Constant *AutoreleaseRVCallee;
+    /// Declaration for ObjC runtime function objc_release.
+    Constant *ReleaseCallee;
+    /// Declaration for ObjC runtime function objc_retain.
+    Constant *RetainCallee;
+    /// Declaration for ObjC runtime function objc_retainBlock.
+    Constant *RetainBlockCallee;
+    /// Declaration for ObjC runtime function objc_autorelease.
+    Constant *AutoreleaseCallee;
+
+    /// Flags which determine whether each of the interesting runtime functions
+    /// is in fact used in the current function.
+    unsigned UsedInThisFunction;
+
+    /// The Metadata Kind for clang.imprecise_release metadata.
+    unsigned ImpreciseReleaseMDKind;
+
+    /// The Metadata Kind for clang.arc.copy_on_escape metadata.
+    unsigned CopyOnEscapeMDKind;
+
+    /// The Metadata Kind for clang.arc.no_objc_arc_exceptions metadata.
+    unsigned NoObjCARCExceptionsMDKind;
+
+    Constant *getRetainRVCallee(Module *M);
+    Constant *getAutoreleaseRVCallee(Module *M);
+    Constant *getReleaseCallee(Module *M);
+    Constant *getRetainCallee(Module *M);
+    Constant *getRetainBlockCallee(Module *M);
+    Constant *getAutoreleaseCallee(Module *M);
+
+    bool IsRetainBlockOptimizable(const Instruction *Inst);
+
+    void OptimizeRetainCall(Function &F, Instruction *Retain);
+    bool OptimizeRetainRVCall(Function &F, Instruction *RetainRV);
+    void OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
+                                   InstructionClass &Class);
+    void OptimizeIndividualCalls(Function &F);
+
+    void CheckForCFGHazards(const BasicBlock *BB,
+                            DenseMap<const BasicBlock *, BBState> &BBStates,
+                            BBState &MyStates) const;
+    bool VisitInstructionBottomUp(Instruction *Inst,
+                                  BasicBlock *BB,
+                                  MapVector<Value *, RRInfo> &Retains,
+                                  BBState &MyStates);
+    bool VisitBottomUp(BasicBlock *BB,
+                       DenseMap<const BasicBlock *, BBState> &BBStates,
+                       MapVector<Value *, RRInfo> &Retains);
+    bool VisitInstructionTopDown(Instruction *Inst,
+                                 DenseMap<Value *, RRInfo> &Releases,
+                                 BBState &MyStates);
+    bool VisitTopDown(BasicBlock *BB,
+                      DenseMap<const BasicBlock *, BBState> &BBStates,
+                      DenseMap<Value *, RRInfo> &Releases);
+    bool Visit(Function &F,
+               DenseMap<const BasicBlock *, BBState> &BBStates,
+               MapVector<Value *, RRInfo> &Retains,
+               DenseMap<Value *, RRInfo> &Releases);
+
+    void MoveCalls(Value *Arg, RRInfo &RetainsToMove, RRInfo &ReleasesToMove,
+                   MapVector<Value *, RRInfo> &Retains,
+                   DenseMap<Value *, RRInfo> &Releases,
+                   SmallVectorImpl<Instruction *> &DeadInsts,
+                   Module *M);
+
+    bool ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> &BBStates,
+                               MapVector<Value *, RRInfo> &Retains,
+                               DenseMap<Value *, RRInfo> &Releases,
+                               Module *M,
+                               SmallVector<Instruction *, 4> &NewRetains,
+                               SmallVector<Instruction *, 4> &NewReleases,
+                               SmallVector<Instruction *, 8> &DeadInsts,
+                               RRInfo &RetainsToMove,
+                               RRInfo &ReleasesToMove,
+                               Value *Arg,
+                               bool KnownSafe,
+                               bool &AnyPairsCompletelyEliminated);
+
+    bool PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates,
+                              MapVector<Value *, RRInfo> &Retains,
+                              DenseMap<Value *,
RRInfo> &Releases, + Module *M); + + void OptimizeWeakCalls(Function &F); + + bool OptimizeSequences(Function &F); + + void OptimizeReturns(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual bool doInitialization(Module &M); + virtual bool runOnFunction(Function &F); + virtual void releaseMemory(); + + public: + static char ID; + ObjCARCOpt() : FunctionPass(ID) { + initializeObjCARCOptPass(*PassRegistry::getPassRegistry()); + } + }; +} + +char ObjCARCOpt::ID = 0; +INITIALIZE_PASS_BEGIN(ObjCARCOpt, + "objc-arc", "ObjC ARC optimization", false, false) +INITIALIZE_PASS_DEPENDENCY(ObjCARCAliasAnalysis) +INITIALIZE_PASS_END(ObjCARCOpt, + "objc-arc", "ObjC ARC optimization", false, false) + +Pass *llvm::createObjCARCOptPass() { + return new ObjCARCOpt(); +} + +void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<ObjCARCAliasAnalysis>(); + AU.addRequired<AliasAnalysis>(); + // ARC optimization doesn't currently split critical edges. + AU.setPreservesCFG(); +} + +bool ObjCARCOpt::IsRetainBlockOptimizable(const Instruction *Inst) { + // Without the magic metadata tag, we have to assume this might be an + // objc_retainBlock call inserted to convert a block pointer to an id, + // in which case it really is needed. + if (!Inst->getMetadata(CopyOnEscapeMDKind)) + return false; + + // If the pointer "escapes" (not including being used in a call), + // the copy may be needed. + if (DoesRetainableObjPtrEscape(Inst)) + return false; + + // Otherwise, it's not needed. + return true; +} + +Constant *ObjCARCOpt::getRetainRVCallee(Module *M) { + if (!RetainRVCallee) { + LLVMContext &C = M->getContext(); + Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + Type *Params[] = { I8X }; + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttributeSet Attribute = + AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); + RetainRVCallee = + M->getOrInsertFunction("objc_retainAutoreleasedReturnValue", FTy, + Attribute); + } + return RetainRVCallee; +} + +Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) { + if (!AutoreleaseRVCallee) { + LLVMContext &C = M->getContext(); + Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + Type *Params[] = { I8X }; + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttributeSet Attribute = + AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); + AutoreleaseRVCallee = + M->getOrInsertFunction("objc_autoreleaseReturnValue", FTy, + Attribute); + } + return AutoreleaseRVCallee; +} + +Constant *ObjCARCOpt::getReleaseCallee(Module *M) { + if (!ReleaseCallee) { + LLVMContext &C = M->getContext(); + Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; + AttributeSet Attribute = + AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); + ReleaseCallee = + M->getOrInsertFunction( + "objc_release", + FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false), + Attribute); + } + return ReleaseCallee; +} + +Constant *ObjCARCOpt::getRetainCallee(Module *M) { + if (!RetainCallee) { + LLVMContext &C = M->getContext(); + Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; + AttributeSet Attribute = + AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); + RetainCallee = + M->getOrInsertFunction( + "objc_retain", + FunctionType::get(Params[0], Params, /*isVarArg=*/false), + 
Attribute);
+  }
+  return RetainCallee;
+}
+
+Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) {
+  if (!RetainBlockCallee) {
+    LLVMContext &C = M->getContext();
+    Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
+    // objc_retainBlock is not nounwind because it calls user copy constructors
+    // which could theoretically throw.
+    RetainBlockCallee =
+      M->getOrInsertFunction(
+        "objc_retainBlock",
+        FunctionType::get(Params[0], Params, /*isVarArg=*/false),
+        AttributeSet());
+  }
+  return RetainBlockCallee;
+}
+
+Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) {
+  if (!AutoreleaseCallee) {
+    LLVMContext &C = M->getContext();
+    Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
+    AttributeSet Attribute =
+      AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex,
+                                  Attribute::NoUnwind);
+    AutoreleaseCallee =
+      M->getOrInsertFunction(
+        "objc_autorelease",
+        FunctionType::get(Params[0], Params, /*isVarArg=*/false),
+        Attribute);
+  }
+  return AutoreleaseCallee;
+}
+
+/// Turn objc_retain into objc_retainAutoreleasedReturnValue if the operand is a
+/// return value.
+void
+ObjCARCOpt::OptimizeRetainCall(Function &F, Instruction *Retain) {
+  ImmutableCallSite CS(GetObjCArg(Retain));
+  const Instruction *Call = CS.getInstruction();
+  if (!Call) return;
+  if (Call->getParent() != Retain->getParent()) return;
+
+  // Check that the call is next to the retain.
+  BasicBlock::const_iterator I = Call;
+  ++I;
+  while (isNoopInstruction(I)) ++I;
+  if (&*I != Retain)
+    return;
+
+  // Turn it into an objc_retainAutoreleasedReturnValue.
+  Changed = true;
+  ++NumPeeps;
+
+  DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainCall: Transforming "
+                  "objc_retain => objc_retainAutoreleasedReturnValue"
+                  " since the operand is a return value.\n"
+                  " Old: "
+               << *Retain << "\n");
+
+  cast<CallInst>(Retain)->setCalledFunction(getRetainRVCallee(F.getParent()));
+
+  DEBUG(dbgs() << " New: "
+               << *Retain << "\n");
+}
+
+/// Turn objc_retainAutoreleasedReturnValue into objc_retain if the operand is
+/// not a return value. Or, if it can be paired with an
+/// objc_autoreleaseReturnValue, delete the pair and return true.
+bool
+ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {
+  // Check for the argument being from an immediately preceding call or invoke.
+  const Value *Arg = GetObjCArg(RetainRV);
+  ImmutableCallSite CS(Arg);
+  if (const Instruction *Call = CS.getInstruction()) {
+    if (Call->getParent() == RetainRV->getParent()) {
+      BasicBlock::const_iterator I = Call;
+      ++I;
+      while (isNoopInstruction(I)) ++I;
+      if (&*I == RetainRV)
+        return false;
+    } else if (const InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+      BasicBlock *RetainRVParent = RetainRV->getParent();
+      if (II->getNormalDest() == RetainRVParent) {
+        BasicBlock::const_iterator I = RetainRVParent->begin();
+        while (isNoopInstruction(I)) ++I;
+        if (&*I == RetainRV)
+          return false;
+      }
+    }
+  }
+
+  // Check for being preceded by an objc_autoreleaseReturnValue on the same
+  // pointer. In this case, we can delete the pair.
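+  // For example (hypothetical IR), in
+  //   %2 = tail call i8* @objc_autoreleaseReturnValue(i8* %1)
+  //   %3 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %2)
+  // the two calls cancel out and both can be erased.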
+  BasicBlock::iterator I = RetainRV, Begin = RetainRV->getParent()->begin();
+  if (I != Begin) {
+    do --I; while (I != Begin && isNoopInstruction(I));
+    if (GetBasicInstructionClass(I) == IC_AutoreleaseRV &&
+        GetObjCArg(I) == Arg) {
+      Changed = true;
+      ++NumPeeps;
+
+      DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainRVCall: Erasing " << *I << "\n"
+                   << " Erasing " << *RetainRV
+                   << "\n");
+
+      EraseInstruction(I);
+      EraseInstruction(RetainRV);
+      return true;
+    }
+  }
+
+  // Turn it into a plain objc_retain.
+  Changed = true;
+  ++NumPeeps;
+
+  DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainRVCall: Transforming "
+                  "objc_retainAutoreleasedReturnValue => "
+                  "objc_retain since the operand is not a return value.\n"
+                  " Old: "
+               << *RetainRV << "\n");
+
+  cast<CallInst>(RetainRV)->setCalledFunction(getRetainCallee(F.getParent()));
+
+  DEBUG(dbgs() << " New: "
+               << *RetainRV << "\n");
+
+  return false;
+}
+
+/// Turn objc_autoreleaseReturnValue into objc_autorelease if the result is not
+/// used as a return value.
+void
+ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
+                                      InstructionClass &Class) {
+  // Check for a return of the pointer value.
+  const Value *Ptr = GetObjCArg(AutoreleaseRV);
+  SmallVector<const Value *, 2> Users;
+  Users.push_back(Ptr);
+  do {
+    Ptr = Users.pop_back_val();
+    for (Value::const_use_iterator UI = Ptr->use_begin(), UE = Ptr->use_end();
+         UI != UE; ++UI) {
+      const User *I = *UI;
+      if (isa<ReturnInst>(I) || GetBasicInstructionClass(I) == IC_RetainRV)
+        return;
+      if (isa<BitCastInst>(I))
+        Users.push_back(I);
+    }
+  } while (!Users.empty());
+
+  Changed = true;
+  ++NumPeeps;
+
+  DEBUG(dbgs() << "ObjCARCOpt::OptimizeAutoreleaseRVCall: Transforming "
+                  "objc_autoreleaseReturnValue => "
+                  "objc_autorelease since its operand is not used as a return "
+                  "value.\n"
+                  " Old: "
+               << *AutoreleaseRV << "\n");
+
+  CallInst *AutoreleaseRVCI = cast<CallInst>(AutoreleaseRV);
+  AutoreleaseRVCI->
+    setCalledFunction(getAutoreleaseCallee(F.getParent()));
+  AutoreleaseRVCI->setTailCall(false); // Never tail call objc_autorelease.
+  Class = IC_Autorelease;
+
+  DEBUG(dbgs() << " New: "
+               << *AutoreleaseRV << "\n");
+}
+
+/// Visit each call, one at a time, and make simplifications without doing any
+/// additional analysis.
+void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
+  // Reset all the flags in preparation for recomputing them.
+  UsedInThisFunction = 0;
+
+  // Visit all objc_* calls in F.
+  for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+    Instruction *Inst = &*I++;
+
+    InstructionClass Class = GetBasicInstructionClass(Inst);
+
+    DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Visiting: Class: "
+          << Class << "; " << *Inst << "\n");
+
+    switch (Class) {
+    default: break;
+
+    // Delete no-op casts. These function calls have special semantics, but
+    // the semantics are entirely implemented via lowering in the front-end,
+    // so by the time they reach the optimizer, they are just no-op calls
+    // which return their argument.
+    //
+    // There are gray areas here, as the ability to cast reference-counted
+    // pointers to raw void* and back allows code to break ARC assumptions,
+    // however these are currently considered to be unimportant.
+    case IC_NoopCast:
+      Changed = true;
+      ++NumNoops;
+      DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Erasing no-op cast:"
+                      " " << *Inst << "\n");
+      EraseInstruction(Inst);
+      continue;
+
+    // If the pointer-to-weak-pointer is null, it's undefined behavior.
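+    // For example (hypothetical IR), "call i8* @objc_loadWeak(i8** null)"
+    // is replaced below by a store of undef through the null pointer and an
+    // undef result, making the undefined behavior explicit.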
+ case IC_StoreWeak: + case IC_LoadWeak: + case IC_LoadWeakRetained: + case IC_InitWeak: + case IC_DestroyWeak: { + CallInst *CI = cast<CallInst>(Inst); + if (isNullOrUndef(CI->getArgOperand(0))) { + Changed = true; + Type *Ty = CI->getArgOperand(0)->getType(); + new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), + Constant::getNullValue(Ty), + CI); + llvm::Value *NewValue = UndefValue::get(CI->getType()); + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: A null " + "pointer-to-weak-pointer is undefined behavior.\n" + " Old = " << *CI << + "\n New = " << + *NewValue << "\n"); + CI->replaceAllUsesWith(NewValue); + CI->eraseFromParent(); + continue; + } + break; + } + case IC_CopyWeak: + case IC_MoveWeak: { + CallInst *CI = cast<CallInst>(Inst); + if (isNullOrUndef(CI->getArgOperand(0)) || + isNullOrUndef(CI->getArgOperand(1))) { + Changed = true; + Type *Ty = CI->getArgOperand(0)->getType(); + new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), + Constant::getNullValue(Ty), + CI); + + llvm::Value *NewValue = UndefValue::get(CI->getType()); + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: A null " + "pointer-to-weak-pointer is undefined behavior.\n" + " Old = " << *CI << + "\n New = " << + *NewValue << "\n"); + + CI->replaceAllUsesWith(NewValue); + CI->eraseFromParent(); + continue; + } + break; + } + case IC_Retain: + OptimizeRetainCall(F, Inst); + break; + case IC_RetainRV: + if (OptimizeRetainRVCall(F, Inst)) + continue; + break; + case IC_AutoreleaseRV: + OptimizeAutoreleaseRVCall(F, Inst, Class); + break; + } + + // objc_autorelease(x) -> objc_release(x) if x is otherwise unused. + if (IsAutorelease(Class) && Inst->use_empty()) { + CallInst *Call = cast<CallInst>(Inst); + const Value *Arg = Call->getArgOperand(0); + Arg = FindSingleUseIdentifiedObject(Arg); + if (Arg) { + Changed = true; + ++NumAutoreleases; + + // Create the declaration lazily. + LLVMContext &C = Inst->getContext(); + CallInst *NewCall = + CallInst::Create(getReleaseCallee(F.getParent()), + Call->getArgOperand(0), "", Call); + NewCall->setMetadata(ImpreciseReleaseMDKind, + MDNode::get(C, ArrayRef<Value *>())); + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Replacing " + "objc_autorelease(x) with objc_release(x) since x is " + "otherwise unused.\n" + " Old: " << *Call << + "\n New: " << + *NewCall << "\n"); + + EraseInstruction(Call); + Inst = NewCall; + Class = IC_Release; + } + } + + // For functions which can never be passed stack arguments, add + // a tail keyword. + if (IsAlwaysTail(Class)) { + Changed = true; + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Adding tail keyword" + " to function since it can never be passed stack args: " << *Inst << + "\n"); + cast<CallInst>(Inst)->setTailCall(); + } + + // Ensure that functions that can never have a "tail" keyword due to the + // semantics of ARC truly do not do so. + if (IsNeverTail(Class)) { + Changed = true; + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Removing tail " + "keyword from function: " << *Inst << + "\n"); + cast<CallInst>(Inst)->setTailCall(false); + } + + // Set nounwind as needed. + if (IsNoThrow(Class)) { + Changed = true; + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Found no throw" + " class. Setting nounwind on: " << *Inst << "\n"); + cast<CallInst>(Inst)->setDoesNotThrow(); + } + + if (!IsNoopOnNull(Class)) { + UsedInThisFunction |= 1 << Class; + continue; + } + + const Value *Arg = GetObjCArg(Inst); + + // ARC calls with null are no-ops. Delete them. 
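+    // For example (hypothetical IR), "call void @objc_release(i8* null)"
+    // has no effect and can simply be erased.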
+ if (isNullOrUndef(Arg)) { + Changed = true; + ++NumNoops; + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: ARC calls with " + " null are no-ops. Erasing: " << *Inst << "\n"); + EraseInstruction(Inst); + continue; + } + + // Keep track of which of retain, release, autorelease, and retain_block + // are actually present in this function. + UsedInThisFunction |= 1 << Class; + + // If Arg is a PHI, and one or more incoming values to the + // PHI are null, and the call is control-equivalent to the PHI, and there + // are no relevant side effects between the PHI and the call, the call + // could be pushed up to just those paths with non-null incoming values. + // For now, don't bother splitting critical edges for this. + SmallVector<std::pair<Instruction *, const Value *>, 4> Worklist; + Worklist.push_back(std::make_pair(Inst, Arg)); + do { + std::pair<Instruction *, const Value *> Pair = Worklist.pop_back_val(); + Inst = Pair.first; + Arg = Pair.second; + + const PHINode *PN = dyn_cast<PHINode>(Arg); + if (!PN) continue; + + // Determine if the PHI has any null operands, or any incoming + // critical edges. + bool HasNull = false; + bool HasCriticalEdges = false; + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = + StripPointerCastsAndObjCCalls(PN->getIncomingValue(i)); + if (isNullOrUndef(Incoming)) + HasNull = true; + else if (cast<TerminatorInst>(PN->getIncomingBlock(i)->back()) + .getNumSuccessors() != 1) { + HasCriticalEdges = true; + break; + } + } + // If we have null operands and no critical edges, optimize. + if (!HasCriticalEdges && HasNull) { + SmallPtrSet<Instruction *, 4> DependingInstructions; + SmallPtrSet<const BasicBlock *, 4> Visited; + + // Check that there is nothing that cares about the reference + // count between the call and the phi. + switch (Class) { + case IC_Retain: + case IC_RetainBlock: + // These can always be moved up. + break; + case IC_Release: + // These can't be moved across things that care about the retain + // count. + FindDependencies(NeedsPositiveRetainCount, Arg, + Inst->getParent(), Inst, + DependingInstructions, Visited, PA); + break; + case IC_Autorelease: + // These can't be moved across autorelease pool scope boundaries. + FindDependencies(AutoreleasePoolBoundary, Arg, + Inst->getParent(), Inst, + DependingInstructions, Visited, PA); + break; + case IC_RetainRV: + case IC_AutoreleaseRV: + // Don't move these; the RV optimization depends on the autoreleaseRV + // being tail called, and the retainRV being immediately after a call + // (which might still happen if we get lucky with codegen layout, but + // it's not worth taking the chance). + continue; + default: + llvm_unreachable("Invalid dependence flavor"); + } + + if (DependingInstructions.size() == 1 && + *DependingInstructions.begin() == PN) { + Changed = true; + ++NumPartialNoops; + // Clone the call into each predecessor that has a non-null value. 
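+        // For example (hypothetical IR), given
+        //   %p = phi i8* [ null, %entry ], [ %x, %nonnull ]
+        //   call void @objc_release(i8* %p)
+        // the call is cloned into %nonnull (the only non-null predecessor)
+        // and the original call is erased.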
+ CallInst *CInst = cast<CallInst>(Inst); + Type *ParamTy = CInst->getArgOperand(0)->getType(); + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = + StripPointerCastsAndObjCCalls(PN->getIncomingValue(i)); + if (!isNullOrUndef(Incoming)) { + CallInst *Clone = cast<CallInst>(CInst->clone()); + Value *Op = PN->getIncomingValue(i); + Instruction *InsertPos = &PN->getIncomingBlock(i)->back(); + if (Op->getType() != ParamTy) + Op = new BitCastInst(Op, ParamTy, "", InsertPos); + Clone->setArgOperand(0, Op); + Clone->insertBefore(InsertPos); + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Cloning " + << *CInst << "\n" + " And inserting " + "clone at " << *InsertPos << "\n"); + Worklist.push_back(std::make_pair(Clone, Incoming)); + } + } + // Erase the original call. + DEBUG(dbgs() << "Erasing: " << *CInst << "\n"); + EraseInstruction(CInst); + continue; + } + } + } while (!Worklist.empty()); + } + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Finished List.\n"); +} + +/// Check for critical edges, loop boundaries, irreducible control flow, or +/// other CFG structures where moving code across the edge would result in it +/// being executed more. +void +ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, + DenseMap<const BasicBlock *, BBState> &BBStates, + BBState &MyStates) const { + // If any top-down local-use or possible-dec has a succ which is earlier in + // the sequence, forget it. + for (BBState::ptr_iterator I = MyStates.top_down_ptr_begin(), + E = MyStates.top_down_ptr_end(); I != E; ++I) + switch (I->second.GetSeq()) { + default: break; + case S_Use: { + const Value *Arg = I->first; + const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); + bool SomeSuccHasSame = false; + bool AllSuccsHaveSame = true; + PtrState &S = I->second; + succ_const_iterator SI(TI), SE(TI, false); + + for (; SI != SE; ++SI) { + Sequence SuccSSeq = S_None; + bool SuccSRRIKnownSafe = false; + // If VisitBottomUp has pointer information for this successor, take + // what we know about it. + DenseMap<const BasicBlock *, BBState>::iterator BBI = + BBStates.find(*SI); + assert(BBI != BBStates.end()); + const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg); + SuccSSeq = SuccS.GetSeq(); + SuccSRRIKnownSafe = SuccS.RRI.KnownSafe; + switch (SuccSSeq) { + case S_None: + case S_CanRelease: { + if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) { + S.ClearSequenceProgress(); + break; + } + continue; + } + case S_Use: + SomeSuccHasSame = true; + break; + case S_Stop: + case S_Release: + case S_MovableRelease: + if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) + AllSuccsHaveSame = false; + break; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + } + } + // If the state at the other end of any of the successor edges + // matches the current state, require all edges to match. This + // guards against loops in the middle of a sequence. + if (SomeSuccHasSame && !AllSuccsHaveSame) + S.ClearSequenceProgress(); + break; + } + case S_CanRelease: { + const Value *Arg = I->first; + const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); + bool SomeSuccHasSame = false; + bool AllSuccsHaveSame = true; + PtrState &S = I->second; + succ_const_iterator SI(TI), SE(TI, false); + + for (; SI != SE; ++SI) { + Sequence SuccSSeq = S_None; + bool SuccSRRIKnownSafe = false; + // If VisitBottomUp has pointer information for this successor, take + // what we know about it. 
+        DenseMap<const BasicBlock *, BBState>::iterator BBI =
+          BBStates.find(*SI);
+        assert(BBI != BBStates.end());
+        const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
+        SuccSSeq = SuccS.GetSeq();
+        SuccSRRIKnownSafe = SuccS.RRI.KnownSafe;
+        switch (SuccSSeq) {
+        case S_None: {
+          if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) {
+            S.ClearSequenceProgress();
+            break;
+          }
+          continue;
+        }
+        case S_CanRelease:
+          SomeSuccHasSame = true;
+          break;
+        case S_Stop:
+        case S_Release:
+        case S_MovableRelease:
+        case S_Use:
+          if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe)
+            AllSuccsHaveSame = false;
+          break;
+        case S_Retain:
+          llvm_unreachable("bottom-up pointer in retain state!");
+        }
+      }
+      // If the state at the other end of any of the successor edges
+      // matches the current state, require all edges to match. This
+      // guards against loops in the middle of a sequence.
+      if (SomeSuccHasSame && !AllSuccsHaveSame)
+        S.ClearSequenceProgress();
+      break;
+    }
+    }
+}
+
+bool
+ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
+                                     BasicBlock *BB,
+                                     MapVector<Value *, RRInfo> &Retains,
+                                     BBState &MyStates) {
+  bool NestingDetected = false;
+  InstructionClass Class = GetInstructionClass(Inst);
+  const Value *Arg = 0;
+
+  switch (Class) {
+  case IC_Release: {
+    Arg = GetObjCArg(Inst);
+
+    PtrState &S = MyStates.getPtrBottomUpState(Arg);
+
+    // If we see two releases in a row on the same pointer, make a note, and
+    // we'll circle back to revisit it after we've hopefully eliminated the
+    // second release, which may allow us to eliminate the first release too.
+    // Theoretically we could implement removal of nested retain+release
+    // pairs by making PtrState hold a stack of states, but this is
+    // simple and avoids adding overhead for the non-nested case.
+    if (S.GetSeq() == S_Release || S.GetSeq() == S_MovableRelease) {
+      DEBUG(dbgs() << "ObjCARCOpt::VisitInstructionBottomUp: Found nested "
+                      "releases (i.e. a release pair)\n");
+      NestingDetected = true;
+    }
+
+    MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind);
+    S.ResetSequenceProgress(ReleaseMetadata ? S_MovableRelease : S_Release);
+    S.RRI.ReleaseMetadata = ReleaseMetadata;
+    S.RRI.KnownSafe = S.IsKnownIncremented();
+    S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
+    S.RRI.Calls.insert(Inst);
+
+    S.SetKnownPositiveRefCount();
+    break;
+  }
+  case IC_RetainBlock:
+    // An objc_retainBlock call with just a use may need to be kept,
+    // because it may be copying a block from the stack to the heap.
+    if (!IsRetainBlockOptimizable(Inst))
+      break;
+    // FALLTHROUGH
+  case IC_Retain:
+  case IC_RetainRV: {
+    Arg = GetObjCArg(Inst);
+
+    PtrState &S = MyStates.getPtrBottomUpState(Arg);
+    S.SetKnownPositiveRefCount();
+
+    switch (S.GetSeq()) {
+    case S_Stop:
+    case S_Release:
+    case S_MovableRelease:
+    case S_Use:
+      S.RRI.ReverseInsertPts.clear();
+      // FALL THROUGH
+    case S_CanRelease:
+      // Don't do retain+release tracking for IC_RetainRV, because it's
+      // better to let it remain as the first instruction after a call.
+      if (Class != IC_RetainRV) {
+        S.RRI.IsRetainBlock = Class == IC_RetainBlock;
+        Retains[Inst] = S.RRI;
+      }
+      S.ClearSequenceProgress();
+      break;
+    case S_None:
+      break;
+    case S_Retain:
+      llvm_unreachable("bottom-up pointer in retain state!");
+    }
+    return NestingDetected;
+  }
+  case IC_AutoreleasepoolPop:
+    // Conservatively, clear MyStates for all known pointers.
+    MyStates.clearBottomUpPointers();
+    return NestingDetected;
+  case IC_AutoreleasepoolPush:
+  case IC_None:
+    // These are irrelevant.
+    return NestingDetected;
+  default:
+    break;
+  }
+
+  // Consider any other possible effects of this instruction on each
+  // pointer being tracked.
+  for (BBState::ptr_iterator MI = MyStates.bottom_up_ptr_begin(),
+       ME = MyStates.bottom_up_ptr_end(); MI != ME; ++MI) {
+    const Value *Ptr = MI->first;
+    if (Ptr == Arg)
+      continue; // Handled above.
+    PtrState &S = MI->second;
+    Sequence Seq = S.GetSeq();
+
+    // Check for possible releases.
+    if (CanAlterRefCount(Inst, Ptr, PA, Class)) {
+      S.ClearRefCount();
+      switch (Seq) {
+      case S_Use:
+        S.SetSeq(S_CanRelease);
+        continue;
+      case S_CanRelease:
+      case S_Release:
+      case S_MovableRelease:
+      case S_Stop:
+      case S_None:
+        break;
+      case S_Retain:
+        llvm_unreachable("bottom-up pointer in retain state!");
+      }
+    }
+
+    // Check for possible direct uses.
+    switch (Seq) {
+    case S_Release:
+    case S_MovableRelease:
+      if (CanUse(Inst, Ptr, PA, Class)) {
+        assert(S.RRI.ReverseInsertPts.empty());
+        // If this is an invoke instruction, we're scanning it as part of
+        // one of its successor blocks, since we can't insert code after it
+        // in its own block, and we don't want to split critical edges.
+        if (isa<InvokeInst>(Inst))
+          S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt());
+        else
+          S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst)));
+        S.SetSeq(S_Use);
+      } else if (Seq == S_Release &&
+                 (Class == IC_User || Class == IC_CallOrUser)) {
+        // Non-movable releases depend on any possible objc pointer use.
+        S.SetSeq(S_Stop);
+        assert(S.RRI.ReverseInsertPts.empty());
+        // As above; handle invoke specially.
+        if (isa<InvokeInst>(Inst))
+          S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt());
+        else
+          S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst)));
+      }
+      break;
+    case S_Stop:
+      if (CanUse(Inst, Ptr, PA, Class))
+        S.SetSeq(S_Use);
+      break;
+    case S_CanRelease:
+    case S_Use:
+    case S_None:
+      break;
+    case S_Retain:
+      llvm_unreachable("bottom-up pointer in retain state!");
+    }
+  }
+
+  return NestingDetected;
+}
+
+bool
+ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
+                          DenseMap<const BasicBlock *, BBState> &BBStates,
+                          MapVector<Value *, RRInfo> &Retains) {
+  bool NestingDetected = false;
+  BBState &MyStates = BBStates[BB];
+
+  // Merge the states from each successor to compute the initial state
+  // for the current block.
+  BBState::edge_iterator SI(MyStates.succ_begin()),
+                         SE(MyStates.succ_end());
+  if (SI != SE) {
+    const BasicBlock *Succ = *SI;
+    DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ);
+    assert(I != BBStates.end());
+    MyStates.InitFromSucc(I->second);
+    ++SI;
+    for (; SI != SE; ++SI) {
+      Succ = *SI;
+      I = BBStates.find(Succ);
+      assert(I != BBStates.end());
+      MyStates.MergeSucc(I->second);
+    }
+  }
+
+  // Visit all the instructions, bottom-up.
+  for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) {
+    Instruction *Inst = llvm::prior(I);
+
+    // Invoke instructions are visited as part of their successors (below).
+    if (isa<InvokeInst>(Inst))
+      continue;
+
+    DEBUG(dbgs() << "ObjCARCOpt::VisitBottomUp: Visiting " << *Inst << "\n");
+
+    NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates);
+  }
+
+  // If there's a predecessor with an invoke, visit the invoke as if it were
+  // part of this block, since we can't insert code after an invoke in its own
+  // block, and we don't want to split critical edges.
+  for (BBState::edge_iterator PI(MyStates.pred_begin()),
+       PE(MyStates.pred_end()); PI != PE; ++PI) {
+    BasicBlock *Pred = *PI;
+    if (InvokeInst *II = dyn_cast<InvokeInst>(&Pred->back()))
+      NestingDetected |= VisitInstructionBottomUp(II, BB, Retains, MyStates);
+  }
+
+  return NestingDetected;
+}
+
+bool
+ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
+                                    DenseMap<Value *, RRInfo> &Releases,
+                                    BBState &MyStates) {
+  bool NestingDetected = false;
+  InstructionClass Class = GetInstructionClass(Inst);
+  const Value *Arg = 0;
+
+  switch (Class) {
+  case IC_RetainBlock:
+    // An objc_retainBlock call with just a use may need to be kept,
+    // because it may be copying a block from the stack to the heap.
+    if (!IsRetainBlockOptimizable(Inst))
+      break;
+    // FALLTHROUGH
+  case IC_Retain:
+  case IC_RetainRV: {
+    Arg = GetObjCArg(Inst);
+
+    PtrState &S = MyStates.getPtrTopDownState(Arg);
+
+    // Don't do retain+release tracking for IC_RetainRV, because it's
+    // better to let it remain as the first instruction after a call.
+    if (Class != IC_RetainRV) {
+      // If we see two retains in a row on the same pointer, make a note, and
+      // we'll circle back to revisit it after we've hopefully eliminated the
+      // second retain, which may allow us to eliminate the first retain too.
+      // Theoretically we could implement removal of nested retain+release
+      // pairs by making PtrState hold a stack of states, but this is
+      // simple and avoids adding overhead for the non-nested case.
+      if (S.GetSeq() == S_Retain)
+        NestingDetected = true;
+
+      S.ResetSequenceProgress(S_Retain);
+      S.RRI.IsRetainBlock = Class == IC_RetainBlock;
+      S.RRI.KnownSafe = S.IsKnownIncremented();
+      S.RRI.Calls.insert(Inst);
+    }
+
+    S.SetKnownPositiveRefCount();
+
+    // A retain can be a potential use; proceed to the generic checking
+    // code below.
+    break;
+  }
+  case IC_Release: {
+    Arg = GetObjCArg(Inst);
+
+    PtrState &S = MyStates.getPtrTopDownState(Arg);
+    S.ClearRefCount();
+
+    switch (S.GetSeq()) {
+    case S_Retain:
+    case S_CanRelease:
+      S.RRI.ReverseInsertPts.clear();
+      // FALL THROUGH
+    case S_Use:
+      S.RRI.ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind);
+      S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
+      Releases[Inst] = S.RRI;
+      S.ClearSequenceProgress();
+      break;
+    case S_None:
+      break;
+    case S_Stop:
+    case S_Release:
+    case S_MovableRelease:
+      llvm_unreachable("top-down pointer in release state!");
+    }
+    break;
+  }
+  case IC_AutoreleasepoolPop:
+    // Conservatively, clear MyStates for all known pointers.
+    MyStates.clearTopDownPointers();
+    return NestingDetected;
+  case IC_AutoreleasepoolPush:
+  case IC_None:
+    // These are irrelevant.
+    return NestingDetected;
+  default:
+    break;
+  }
+
+  // Consider any other possible effects of this instruction on each
+  // pointer being tracked.
+  for (BBState::ptr_iterator MI = MyStates.top_down_ptr_begin(),
+       ME = MyStates.top_down_ptr_end(); MI != ME; ++MI) {
+    const Value *Ptr = MI->first;
+    if (Ptr == Arg)
+      continue; // Handled above.
+    PtrState &S = MI->second;
+    Sequence Seq = S.GetSeq();
+
+    // Check for possible releases.
+    if (CanAlterRefCount(Inst, Ptr, PA, Class)) {
+      S.ClearRefCount();
+      switch (Seq) {
+      case S_Retain:
+        S.SetSeq(S_CanRelease);
+        assert(S.RRI.ReverseInsertPts.empty());
+        S.RRI.ReverseInsertPts.insert(Inst);
+
+        // One call can't cause a transition from S_Retain to S_CanRelease
+        // and S_CanRelease to S_Use. If we've made the first transition,
+        // we're done.
+ continue; + case S_Use: + case S_CanRelease: + case S_None: + break; + case S_Stop: + case S_Release: + case S_MovableRelease: + llvm_unreachable("top-down pointer in release state!"); + } + } + + // Check for possible direct uses. + switch (Seq) { + case S_CanRelease: + if (CanUse(Inst, Ptr, PA, Class)) + S.SetSeq(S_Use); + break; + case S_Retain: + case S_Use: + case S_None: + break; + case S_Stop: + case S_Release: + case S_MovableRelease: + llvm_unreachable("top-down pointer in release state!"); + } + } + + return NestingDetected; +} + +bool +ObjCARCOpt::VisitTopDown(BasicBlock *BB, + DenseMap<const BasicBlock *, BBState> &BBStates, + DenseMap<Value *, RRInfo> &Releases) { + bool NestingDetected = false; + BBState &MyStates = BBStates[BB]; + + // Merge the states from each predecessor to compute the initial state + // for the current block. + BBState::edge_iterator PI(MyStates.pred_begin()), + PE(MyStates.pred_end()); + if (PI != PE) { + const BasicBlock *Pred = *PI; + DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred); + assert(I != BBStates.end()); + MyStates.InitFromPred(I->second); + ++PI; + for (; PI != PE; ++PI) { + Pred = *PI; + I = BBStates.find(Pred); + assert(I != BBStates.end()); + MyStates.MergePred(I->second); + } + } + + // Visit all the instructions, top-down. + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + Instruction *Inst = I; + + DEBUG(dbgs() << "ObjCARCOpt::VisitTopDown: Visiting " << *Inst << "\n"); + + NestingDetected |= VisitInstructionTopDown(Inst, Releases, MyStates); + } + + CheckForCFGHazards(BB, BBStates, MyStates); + return NestingDetected; +} + +static void +ComputePostOrders(Function &F, + SmallVectorImpl<BasicBlock *> &PostOrder, + SmallVectorImpl<BasicBlock *> &ReverseCFGPostOrder, + unsigned NoObjCARCExceptionsMDKind, + DenseMap<const BasicBlock *, BBState> &BBStates) { + /// The visited set, for doing DFS walks. + SmallPtrSet<BasicBlock *, 16> Visited; + + // Do DFS, computing the PostOrder. + SmallPtrSet<BasicBlock *, 16> OnStack; + SmallVector<std::pair<BasicBlock *, succ_iterator>, 16> SuccStack; + + // Functions always have exactly one entry block, and we don't have + // any other block that we treat like an entry block. + BasicBlock *EntryBB = &F.getEntryBlock(); + BBState &MyStates = BBStates[EntryBB]; + MyStates.SetAsEntry(); + TerminatorInst *EntryTI = cast<TerminatorInst>(&EntryBB->back()); + SuccStack.push_back(std::make_pair(EntryBB, succ_iterator(EntryTI))); + Visited.insert(EntryBB); + OnStack.insert(EntryBB); + do { + dfs_next_succ: + BasicBlock *CurrBB = SuccStack.back().first; + TerminatorInst *TI = cast<TerminatorInst>(&CurrBB->back()); + succ_iterator SE(TI, false); + + while (SuccStack.back().second != SE) { + BasicBlock *SuccBB = *SuccStack.back().second++; + if (Visited.insert(SuccBB)) { + TerminatorInst *TI = cast<TerminatorInst>(&SuccBB->back()); + SuccStack.push_back(std::make_pair(SuccBB, succ_iterator(TI))); + BBStates[CurrBB].addSucc(SuccBB); + BBState &SuccStates = BBStates[SuccBB]; + SuccStates.addPred(CurrBB); + OnStack.insert(SuccBB); + goto dfs_next_succ; + } + + if (!OnStack.count(SuccBB)) { + BBStates[CurrBB].addSucc(SuccBB); + BBStates[SuccBB].addPred(CurrBB); + } + } + OnStack.erase(CurrBB); + PostOrder.push_back(CurrBB); + SuccStack.pop_back(); + } while (!SuccStack.empty()); + + Visited.clear(); + + // Do reverse-CFG DFS, computing the reverse-CFG PostOrder. 
+  // Functions may have many exits, and there are also blocks which we treat
+  // as exits due to ignored edges.
+  SmallVector<std::pair<BasicBlock *, BBState::edge_iterator>, 16> PredStack;
+  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+    BasicBlock *ExitBB = I;
+    BBState &MyStates = BBStates[ExitBB];
+    if (!MyStates.isExit())
+      continue;
+
+    MyStates.SetAsExit();
+
+    PredStack.push_back(std::make_pair(ExitBB, MyStates.pred_begin()));
+    Visited.insert(ExitBB);
+    while (!PredStack.empty()) {
+    reverse_dfs_next_succ:
+      BBState::edge_iterator PE = BBStates[PredStack.back().first].pred_end();
+      while (PredStack.back().second != PE) {
+        BasicBlock *BB = *PredStack.back().second++;
+        if (Visited.insert(BB)) {
+          PredStack.push_back(std::make_pair(BB, BBStates[BB].pred_begin()));
+          goto reverse_dfs_next_succ;
+        }
+      }
+      ReverseCFGPostOrder.push_back(PredStack.pop_back_val().first);
+    }
+  }
+}
+
+// Visit the function both top-down and bottom-up.
+bool
+ObjCARCOpt::Visit(Function &F,
+                  DenseMap<const BasicBlock *, BBState> &BBStates,
+                  MapVector<Value *, RRInfo> &Retains,
+                  DenseMap<Value *, RRInfo> &Releases) {
+
+  // Use reverse-postorder traversals, because we magically know that loops
+  // will be well behaved, i.e. they won't repeatedly call retain on a single
+  // pointer without doing a release. We can't use the ReversePostOrderTraversal
+  // class here because we want the reverse-CFG postorder to consider each
+  // function exit point, and we want to ignore selected cycle edges.
+  SmallVector<BasicBlock *, 16> PostOrder;
+  SmallVector<BasicBlock *, 16> ReverseCFGPostOrder;
+  ComputePostOrders(F, PostOrder, ReverseCFGPostOrder,
+                    NoObjCARCExceptionsMDKind,
+                    BBStates);
+
+  // Use reverse-postorder on the reverse CFG for bottom-up.
+  bool BottomUpNestingDetected = false;
+  for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator I =
+       ReverseCFGPostOrder.rbegin(), E = ReverseCFGPostOrder.rend();
+       I != E; ++I)
+    BottomUpNestingDetected |= VisitBottomUp(*I, BBStates, Retains);
+
+  // Use reverse-postorder for top-down.
+  bool TopDownNestingDetected = false;
+  for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator I =
+       PostOrder.rbegin(), E = PostOrder.rend();
+       I != E; ++I)
+    TopDownNestingDetected |= VisitTopDown(*I, BBStates, Releases);
+
+  return TopDownNestingDetected && BottomUpNestingDetected;
+}
+
+/// Move the calls in RetainsToMove and ReleasesToMove.
+void ObjCARCOpt::MoveCalls(Value *Arg,
+                           RRInfo &RetainsToMove,
+                           RRInfo &ReleasesToMove,
+                           MapVector<Value *, RRInfo> &Retains,
+                           DenseMap<Value *, RRInfo> &Releases,
+                           SmallVectorImpl<Instruction *> &DeadInsts,
+                           Module *M) {
+  Type *ArgTy = Arg->getType();
+  Type *ParamTy = PointerType::getUnqual(Type::getInt8Ty(ArgTy->getContext()));
+
+  // Insert the new retain and release calls.
+  for (SmallPtrSet<Instruction *, 2>::const_iterator
+       PI = ReleasesToMove.ReverseInsertPts.begin(),
+       PE = ReleasesToMove.ReverseInsertPts.end(); PI != PE; ++PI) {
+    Instruction *InsertPt = *PI;
+    Value *MyArg = ArgTy == ParamTy ? Arg :
+                   new BitCastInst(Arg, ParamTy, "", InsertPt);
+    CallInst *Call =
+      CallInst::Create(RetainsToMove.IsRetainBlock ?
+                       getRetainBlockCallee(M) : getRetainCallee(M),
+                       MyArg, "", InsertPt);
+    Call->setDoesNotThrow();
+    if (RetainsToMove.IsRetainBlock)
+      Call->setMetadata(CopyOnEscapeMDKind,
+                        MDNode::get(M->getContext(), ArrayRef<Value *>()));
+    else
+      Call->setTailCall();
+
+    DEBUG(dbgs() << "ObjCARCOpt::MoveCalls: Inserting new Retain: " << *Call
+                 << "\n"
+                    " At insertion point: " << *InsertPt
+                 << "\n");
+  }
+  for (SmallPtrSet<Instruction *, 2>::const_iterator
+       PI = RetainsToMove.ReverseInsertPts.begin(),
+       PE = RetainsToMove.ReverseInsertPts.end(); PI != PE; ++PI) {
+    Instruction *InsertPt = *PI;
+    Value *MyArg = ArgTy == ParamTy ? Arg :
+                   new BitCastInst(Arg, ParamTy, "", InsertPt);
+    CallInst *Call = CallInst::Create(getReleaseCallee(M), MyArg,
+                                      "", InsertPt);
+    // Attach a clang.imprecise_release metadata tag, if appropriate.
+    if (MDNode *M = ReleasesToMove.ReleaseMetadata)
+      Call->setMetadata(ImpreciseReleaseMDKind, M);
+    Call->setDoesNotThrow();
+    if (ReleasesToMove.IsTailCallRelease)
+      Call->setTailCall();
+
+    DEBUG(dbgs() << "ObjCARCOpt::MoveCalls: Inserting new Release: " << *Call
+                 << "\n"
+                    " At insertion point: " << *InsertPt
+                 << "\n");
+  }
+
+  // Delete the original retain and release calls.
+  for (SmallPtrSet<Instruction *, 2>::const_iterator
+       AI = RetainsToMove.Calls.begin(),
+       AE = RetainsToMove.Calls.end(); AI != AE; ++AI) {
+    Instruction *OrigRetain = *AI;
+    Retains.blot(OrigRetain);
+    DeadInsts.push_back(OrigRetain);
+    DEBUG(dbgs() << "ObjCARCOpt::MoveCalls: Deleting retain: " << *OrigRetain <<
+          "\n");
+  }
+  for (SmallPtrSet<Instruction *, 2>::const_iterator
+       AI = ReleasesToMove.Calls.begin(),
+       AE = ReleasesToMove.Calls.end(); AI != AE; ++AI) {
+    Instruction *OrigRelease = *AI;
+    Releases.erase(OrigRelease);
+    DeadInsts.push_back(OrigRelease);
+    DEBUG(dbgs() << "ObjCARCOpt::MoveCalls: Deleting release: " << *OrigRelease
+          << "\n");
+  }
+}
+
+bool
+ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
+                                    &BBStates,
+                                  MapVector<Value *, RRInfo> &Retains,
+                                  DenseMap<Value *, RRInfo> &Releases,
+                                  Module *M,
+                                  SmallVector<Instruction *, 4> &NewRetains,
+                                  SmallVector<Instruction *, 4> &NewReleases,
+                                  SmallVector<Instruction *, 8> &DeadInsts,
+                                  RRInfo &RetainsToMove,
+                                  RRInfo &ReleasesToMove,
+                                  Value *Arg,
+                                  bool KnownSafe,
+                                  bool &AnyPairsCompletelyEliminated) {
+  // If a pair happens in a region where it is known that the reference count
+  // is already incremented, we can similarly ignore possible decrements.
+  bool KnownSafeTD = true, KnownSafeBU = true;
+
+  // Connect the dots between the top-down-collected RetainsToMove and
+  // bottom-up-collected ReleasesToMove to form sets of related calls.
+  // This is an iterative process so that we connect multiple releases
+  // to multiple retains if needed.
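+  // OldDelta and NewDelta measure the retain/release balance, weighted by the
+  // number of entry-to-exit paths through each call's block, for the original
+  // calls and for the proposed insertion points respectively; OldCount and
+  // NewCount are the corresponding weighted retain counts.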
+  unsigned OldDelta = 0;
+  unsigned NewDelta = 0;
+  unsigned OldCount = 0;
+  unsigned NewCount = 0;
+  bool FirstRelease = true;
+  bool FirstRetain = true;
+  for (;;) {
+    for (SmallVectorImpl<Instruction *>::const_iterator
+         NI = NewRetains.begin(), NE = NewRetains.end(); NI != NE; ++NI) {
+      Instruction *NewRetain = *NI;
+      MapVector<Value *, RRInfo>::const_iterator It = Retains.find(NewRetain);
+      assert(It != Retains.end());
+      const RRInfo &NewRetainRRI = It->second;
+      KnownSafeTD &= NewRetainRRI.KnownSafe;
+      for (SmallPtrSet<Instruction *, 2>::const_iterator
+           LI = NewRetainRRI.Calls.begin(),
+           LE = NewRetainRRI.Calls.end(); LI != LE; ++LI) {
+        Instruction *NewRetainRelease = *LI;
+        DenseMap<Value *, RRInfo>::const_iterator Jt =
+          Releases.find(NewRetainRelease);
+        if (Jt == Releases.end())
+          return false;
+        const RRInfo &NewRetainReleaseRRI = Jt->second;
+        assert(NewRetainReleaseRRI.Calls.count(NewRetain));
+        if (ReleasesToMove.Calls.insert(NewRetainRelease)) {
+          OldDelta -=
+            BBStates[NewRetainRelease->getParent()].GetAllPathCount();
+
+          // Merge the ReleaseMetadata and IsTailCallRelease values.
+          if (FirstRelease) {
+            ReleasesToMove.ReleaseMetadata =
+              NewRetainReleaseRRI.ReleaseMetadata;
+            ReleasesToMove.IsTailCallRelease =
+              NewRetainReleaseRRI.IsTailCallRelease;
+            FirstRelease = false;
+          } else {
+            if (ReleasesToMove.ReleaseMetadata !=
+                NewRetainReleaseRRI.ReleaseMetadata)
+              ReleasesToMove.ReleaseMetadata = 0;
+            if (ReleasesToMove.IsTailCallRelease !=
+                NewRetainReleaseRRI.IsTailCallRelease)
+              ReleasesToMove.IsTailCallRelease = false;
+          }
+
+          // Collect the optimal insertion points.
+          if (!KnownSafe)
+            for (SmallPtrSet<Instruction *, 2>::const_iterator
+                 RI = NewRetainReleaseRRI.ReverseInsertPts.begin(),
+                 RE = NewRetainReleaseRRI.ReverseInsertPts.end();
+                 RI != RE; ++RI) {
+              Instruction *RIP = *RI;
+              if (ReleasesToMove.ReverseInsertPts.insert(RIP))
+                NewDelta -= BBStates[RIP->getParent()].GetAllPathCount();
+            }
+          NewReleases.push_back(NewRetainRelease);
+        }
+      }
+    }
+    NewRetains.clear();
+    if (NewReleases.empty()) break;
+
+    // Back the other way.
+    for (SmallVectorImpl<Instruction *>::const_iterator
+         NI = NewReleases.begin(), NE = NewReleases.end(); NI != NE; ++NI) {
+      Instruction *NewRelease = *NI;
+      DenseMap<Value *, RRInfo>::const_iterator It =
+        Releases.find(NewRelease);
+      assert(It != Releases.end());
+      const RRInfo &NewReleaseRRI = It->second;
+      KnownSafeBU &= NewReleaseRRI.KnownSafe;
+      for (SmallPtrSet<Instruction *, 2>::const_iterator
+           LI = NewReleaseRRI.Calls.begin(),
+           LE = NewReleaseRRI.Calls.end(); LI != LE; ++LI) {
+        Instruction *NewReleaseRetain = *LI;
+        MapVector<Value *, RRInfo>::const_iterator Jt =
+          Retains.find(NewReleaseRetain);
+        if (Jt == Retains.end())
+          return false;
+        const RRInfo &NewReleaseRetainRRI = Jt->second;
+        assert(NewReleaseRetainRRI.Calls.count(NewRelease));
+        if (RetainsToMove.Calls.insert(NewReleaseRetain)) {
+          unsigned PathCount =
+            BBStates[NewReleaseRetain->getParent()].GetAllPathCount();
+          OldDelta += PathCount;
+          OldCount += PathCount;
+
+          // Merge the IsRetainBlock values.
+          if (FirstRetain) {
+            RetainsToMove.IsRetainBlock = NewReleaseRetainRRI.IsRetainBlock;
+            FirstRetain = false;
+          } else if (RetainsToMove.IsRetainBlock !=
+                     NewReleaseRetainRRI.IsRetainBlock)
+            // It's not possible to merge the sequences if one uses
+            // objc_retain and the other uses objc_retainBlock.
+            return false;
+
+          // Collect the optimal insertion points.
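+          // (Insertion points are only needed when the pairing is not known
+          // safe; the known-safe case clears them again below.)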
+ if (!KnownSafe) + for (SmallPtrSet<Instruction *, 2>::const_iterator + RI = NewReleaseRetainRRI.ReverseInsertPts.begin(), + RE = NewReleaseRetainRRI.ReverseInsertPts.end(); + RI != RE; ++RI) { + Instruction *RIP = *RI; + if (RetainsToMove.ReverseInsertPts.insert(RIP)) { + PathCount = BBStates[RIP->getParent()].GetAllPathCount(); + NewDelta += PathCount; + NewCount += PathCount; + } + } + NewRetains.push_back(NewReleaseRetain); + } + } + } + NewReleases.clear(); + if (NewRetains.empty()) break; + } + + // If the pointer is known incremented or nested, we can safely delete the + // pair regardless of what's between them. + if (KnownSafeTD || KnownSafeBU) { + RetainsToMove.ReverseInsertPts.clear(); + ReleasesToMove.ReverseInsertPts.clear(); + NewCount = 0; + } else { + // Determine whether the new insertion points we computed preserve the + // balance of retain and release calls through the program. + // TODO: If the fully aggressive solution isn't valid, try to find a + // less aggressive solution which is. + if (NewDelta != 0) + return false; + } + + // Determine whether the original call points are balanced in the retain and + // release calls through the program. If not, conservatively don't touch + // them. + // TODO: It's theoretically possible to do code motion in this case, as + // long as the existing imbalances are maintained. + if (OldDelta != 0) + return false; + + Changed = true; + assert(OldCount != 0 && "Unreachable code?"); + NumRRs += OldCount - NewCount; + // Set to true if we completely removed any RR pairs. + AnyPairsCompletelyEliminated = NewCount == 0; + + // We can move calls! + return true; +} + +/// Identify pairings between the retains and releases, and delete and/or move +/// them. +bool +ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> + &BBStates, + MapVector<Value *, RRInfo> &Retains, + DenseMap<Value *, RRInfo> &Releases, + Module *M) { + bool AnyPairsCompletelyEliminated = false; + RRInfo RetainsToMove; + RRInfo ReleasesToMove; + SmallVector<Instruction *, 4> NewRetains; + SmallVector<Instruction *, 4> NewReleases; + SmallVector<Instruction *, 8> DeadInsts; + + // Visit each retain. + for (MapVector<Value *, RRInfo>::const_iterator I = Retains.begin(), + E = Retains.end(); I != E; ++I) { + Value *V = I->first; + if (!V) continue; // blotted + + Instruction *Retain = cast<Instruction>(V); + + DEBUG(dbgs() << "ObjCARCOpt::PerformCodePlacement: Visiting: " << *Retain + << "\n"); + + Value *Arg = GetObjCArg(Retain); + + // If the object being released is in static or stack storage, we know it's + // not being managed by ObjC reference counting, so we can delete pairs + // regardless of what possible decrements or uses lie between them. + bool KnownSafe = isa<Constant>(Arg) || isa<AllocaInst>(Arg); + + // A constant pointer can't be pointing to an object on the heap. It may + // be reference-counted, but it won't be deleted. + if (const LoadInst *LI = dyn_cast<LoadInst>(Arg)) + if (const GlobalVariable *GV = + dyn_cast<GlobalVariable>( + StripPointerCastsAndObjCCalls(LI->getPointerOperand()))) + if (GV->isConstant()) + KnownSafe = true; + + // Connect the dots between the top-down-collected RetainsToMove and + // bottom-up-collected ReleasesToMove to form sets of related calls. 
+ NewRetains.push_back(Retain); + bool PerformMoveCalls = + ConnectTDBUTraversals(BBStates, Retains, Releases, M, NewRetains, + NewReleases, DeadInsts, RetainsToMove, + ReleasesToMove, Arg, KnownSafe, + AnyPairsCompletelyEliminated); + + if (PerformMoveCalls) { + // Ok, everything checks out and we're all set. Let's move/delete some + // code! + MoveCalls(Arg, RetainsToMove, ReleasesToMove, + Retains, Releases, DeadInsts, M); + } + + // Clean up state for next retain. + NewReleases.clear(); + NewRetains.clear(); + RetainsToMove.clear(); + ReleasesToMove.clear(); + } + + // Now that we're done moving everything, we can delete the newly dead + // instructions, as we no longer need them as insert points. + while (!DeadInsts.empty()) + EraseInstruction(DeadInsts.pop_back_val()); + + return AnyPairsCompletelyEliminated; +} + +/// Weak pointer optimizations. +void ObjCARCOpt::OptimizeWeakCalls(Function &F) { + // First, do memdep-style RLE and S2L optimizations. We can't use memdep + // itself because it uses AliasAnalysis and we need to do provenance + // queries instead. + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { + Instruction *Inst = &*I++; + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeWeakCalls: Visiting: " << *Inst << + "\n"); + + InstructionClass Class = GetBasicInstructionClass(Inst); + if (Class != IC_LoadWeak && Class != IC_LoadWeakRetained) + continue; + + // Delete objc_loadWeak calls with no users. + if (Class == IC_LoadWeak && Inst->use_empty()) { + Inst->eraseFromParent(); + continue; + } + + // TODO: For now, just look for an earlier available version of this value + // within the same block. Theoretically, we could do memdep-style non-local + // analysis too, but that would want caching. A better approach would be to + // use the technique that EarlyCSE uses. + inst_iterator Current = llvm::prior(I); + BasicBlock *CurrentBB = Current.getBasicBlockIterator(); + for (BasicBlock::iterator B = CurrentBB->begin(), + J = Current.getInstructionIterator(); + J != B; --J) { + Instruction *EarlierInst = &*llvm::prior(J); + InstructionClass EarlierClass = GetInstructionClass(EarlierInst); + switch (EarlierClass) { + case IC_LoadWeak: + case IC_LoadWeakRetained: { + // If this is loading from the same pointer, replace this load's value + // with that one. + CallInst *Call = cast<CallInst>(Inst); + CallInst *EarlierCall = cast<CallInst>(EarlierInst); + Value *Arg = Call->getArgOperand(0); + Value *EarlierArg = EarlierCall->getArgOperand(0); + switch (PA.getAA()->alias(Arg, EarlierArg)) { + case AliasAnalysis::MustAlias: + Changed = true; + // If the load has a builtin retain, insert a plain retain for it. + if (Class == IC_LoadWeakRetained) { + CallInst *CI = + CallInst::Create(getRetainCallee(F.getParent()), EarlierCall, + "", Call); + CI->setTailCall(); + } + // Zap the fully redundant load. + Call->replaceAllUsesWith(EarlierCall); + Call->eraseFromParent(); + goto clobbered; + case AliasAnalysis::MayAlias: + case AliasAnalysis::PartialAlias: + goto clobbered; + case AliasAnalysis::NoAlias: + break; + } + break; + } + case IC_StoreWeak: + case IC_InitWeak: { + // If this is storing to the same pointer and has the same size etc. + // replace this load's value with the stored value. 
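      // An illustrative sketch (hypothetical IR, not from this patch): the
      // replacement below forwards argument 1 of the earlier call, because
      // objc_storeWeak and objc_initWeak store and return their second
      // argument:
      //
      // \code
      //   %0 = call i8* @objc_storeWeak(i8** %loc, i8* %val)
      //   ...                            ; nothing clobbers %loc in between
      //   %1 = call i8* @objc_loadWeak(i8** %loc)
      //   ; uses of %1 can be rewritten to use %val
      // \endcode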
+        CallInst *Call = cast<CallInst>(Inst);
+        CallInst *EarlierCall = cast<CallInst>(EarlierInst);
+        Value *Arg = Call->getArgOperand(0);
+        Value *EarlierArg = EarlierCall->getArgOperand(0);
+        switch (PA.getAA()->alias(Arg, EarlierArg)) {
+        case AliasAnalysis::MustAlias:
+          Changed = true;
+          // If the load has a builtin retain, insert a plain retain for it.
+          if (Class == IC_LoadWeakRetained) {
+            CallInst *CI =
+              CallInst::Create(getRetainCallee(F.getParent()), EarlierCall,
+                               "", Call);
+            CI->setTailCall();
+          }
+          // Zap the fully redundant load.
+          Call->replaceAllUsesWith(EarlierCall->getArgOperand(1));
+          Call->eraseFromParent();
+          goto clobbered;
+        case AliasAnalysis::MayAlias:
+        case AliasAnalysis::PartialAlias:
+          goto clobbered;
+        case AliasAnalysis::NoAlias:
+          break;
+        }
+        break;
+      }
+      case IC_MoveWeak:
+      case IC_CopyWeak:
+        // TODO: Grab the copied value.
+        goto clobbered;
+      case IC_AutoreleasepoolPush:
+      case IC_None:
+      case IC_User:
+        // Weak pointers are only modified through the weak entry points
+        // (and arbitrary calls, which could call the weak entry points).
+        break;
+      default:
+        // Anything else could modify the weak pointer.
+        goto clobbered;
+      }
+    }
+  clobbered:;
+  }
+
+  // Then, for each destroyWeak with an alloca operand, check to see if
+  // the alloca and all its users can be zapped.
+  for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+    Instruction *Inst = &*I++;
+    InstructionClass Class = GetBasicInstructionClass(Inst);
+    if (Class != IC_DestroyWeak)
+      continue;
+
+    CallInst *Call = cast<CallInst>(Inst);
+    Value *Arg = Call->getArgOperand(0);
+    if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Arg)) {
+      for (Value::use_iterator UI = Alloca->use_begin(),
+           UE = Alloca->use_end(); UI != UE; ++UI) {
+        const Instruction *UserInst = cast<Instruction>(*UI);
+        switch (GetBasicInstructionClass(UserInst)) {
+        case IC_InitWeak:
+        case IC_StoreWeak:
+        case IC_DestroyWeak:
+          continue;
+        default:
+          goto done;
+        }
+      }
+      Changed = true;
+      for (Value::use_iterator UI = Alloca->use_begin(),
+           UE = Alloca->use_end(); UI != UE; ) {
+        CallInst *UserInst = cast<CallInst>(*UI++);
+        switch (GetBasicInstructionClass(UserInst)) {
+        case IC_InitWeak:
+        case IC_StoreWeak:
+          // These functions return their second argument.
+          UserInst->replaceAllUsesWith(UserInst->getArgOperand(1));
+          break;
+        case IC_DestroyWeak:
+          // No return value.
+          break;
+        default:
+          llvm_unreachable("alloca really is used!");
+        }
+        UserInst->eraseFromParent();
+      }
+      Alloca->eraseFromParent();
+    done:;
+    }
+  }
+
+  DEBUG(dbgs() << "ObjCARCOpt::OptimizeWeakCalls: Finished List.\n\n");
+
+}
+
+/// Identify program paths which execute sequences of retains and releases which
+/// can be eliminated.
+bool ObjCARCOpt::OptimizeSequences(Function &F) {
+  /// Releases, Retains - These are used to store the results of the main flow
+  /// analysis. These use Value* as the key instead of Instruction* so that the
+  /// map stays valid when we get around to rewriting code and calls get
+  /// replaced by arguments.
+  DenseMap<Value *, RRInfo> Releases;
+  MapVector<Value *, RRInfo> Retains;
+
+  /// This is used during the traversal of the function to track the
+  /// states for each identified object at each block.
+  DenseMap<const BasicBlock *, BBState> BBStates;
+
+  // Analyze the CFG of the function, and all instructions.
+  bool NestingDetected = Visit(F, BBStates, Retains, Releases);
+
+  // Transform.
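  // A note on the conjunction below (sketch, not part of this patch): the
  // function reports success only when code placement removed pairs *and*
  // the traversal saw nested pairs, which is what lets the caller iterate
  // to a fixed point:
  //
  // \code
  //   // In runOnFunction, below:
  //   while (OptimizeSequences(F)) {}
  // \endcode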
+  return PerformCodePlacement(BBStates, Retains, Releases, F.getParent()) &&
+         NestingDetected;
+}
+
+/// Look for this pattern:
+/// \code
+///    %call = call i8* @something(...)
+///    %2 = call i8* @objc_retain(i8* %call)
+///    %3 = call i8* @objc_autorelease(i8* %2)
+///    ret i8* %3
+/// \endcode
+/// And delete the retain and autorelease.
+///
+/// Otherwise if it's just this:
+/// \code
+///    %3 = call i8* @objc_autorelease(i8* %2)
+///    ret i8* %3
+/// \endcode
+/// convert the autorelease to autoreleaseRV.
+void ObjCARCOpt::OptimizeReturns(Function &F) {
+  if (!F.getReturnType()->isPointerTy())
+    return;
+
+  SmallPtrSet<Instruction *, 4> DependingInstructions;
+  SmallPtrSet<const BasicBlock *, 4> Visited;
+  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
+    BasicBlock *BB = FI;
+    ReturnInst *Ret = dyn_cast<ReturnInst>(&BB->back());
+    if (!Ret) continue;
+
+    DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Visiting: " << *Ret << "\n");
+
+    const Value *Arg = StripPointerCastsAndObjCCalls(Ret->getOperand(0));
+    FindDependencies(NeedsPositiveRetainCount, Arg,
+                     BB, Ret, DependingInstructions, Visited, PA);
+    if (DependingInstructions.size() != 1)
+      goto next_block;
+
+    {
+      CallInst *Autorelease =
+        dyn_cast_or_null<CallInst>(*DependingInstructions.begin());
+      if (!Autorelease)
+        goto next_block;
+      InstructionClass AutoreleaseClass = GetBasicInstructionClass(Autorelease);
+      if (!IsAutorelease(AutoreleaseClass))
+        goto next_block;
+      if (GetObjCArg(Autorelease) != Arg)
+        goto next_block;
+
+      DependingInstructions.clear();
+      Visited.clear();
+
+      // Check that there is nothing that can affect the reference
+      // count between the autorelease and the retain.
+      FindDependencies(CanChangeRetainCount, Arg,
+                       BB, Autorelease, DependingInstructions, Visited, PA);
+      if (DependingInstructions.size() != 1)
+        goto next_block;
+
+      {
+        CallInst *Retain =
+          dyn_cast_or_null<CallInst>(*DependingInstructions.begin());
+
+        // Check that we found a retain with the same argument.
+        if (!Retain ||
+            !IsRetain(GetBasicInstructionClass(Retain)) ||
+            GetObjCArg(Retain) != Arg)
+          goto next_block;
+
+        DependingInstructions.clear();
+        Visited.clear();
+
+        // Convert the autorelease to an autoreleaseRV, since it's
+        // returning the value.
+        if (AutoreleaseClass == IC_Autorelease) {
+          DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Converting autorelease "
+                          "=> autoreleaseRV since it's returning a value.\n"
+                          "                             In: " << *Autorelease
+                       << "\n");
+          Autorelease->setCalledFunction(getAutoreleaseRVCallee(F.getParent()));
+          DEBUG(dbgs() << "                             Out: " << *Autorelease
+                       << "\n");
+          Autorelease->setTailCall(); // Always tail call autoreleaseRV.
+          AutoreleaseClass = IC_AutoreleaseRV;
+        }
+
+        // Check that there is nothing that can affect the reference
+        // count between the retain and the call.
+        // Note that Retain need not be in BB.
+        FindDependencies(CanChangeRetainCount, Arg, Retain->getParent(), Retain,
+                         DependingInstructions, Visited, PA);
+        if (DependingInstructions.size() != 1)
+          goto next_block;
+
+        {
+          CallInst *Call =
+            dyn_cast_or_null<CallInst>(*DependingInstructions.begin());
+
+          // Check that the pointer is the return value of the call.
+          if (!Call || Arg != Call)
+            goto next_block;
+
+          // Check that the call is a regular call.
+          InstructionClass Class = GetBasicInstructionClass(Call);
+          if (Class != IC_CallOrUser && Class != IC_Call)
+            goto next_block;
+
+          // If so, we can zap the retain and autorelease.
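          // An illustrative before/after for the pattern from the function
          // comment above (hypothetical IR, not from this patch):
          //
          // \code
          //   %call = call i8* @something(...)
          //   %2 = call i8* @objc_retain(i8* %call)
          //   %3 = call i8* @objc_autorelease(i8* %2)
          //   ret i8* %3
          // =>
          //   %call = call i8* @something(...)
          //   ret i8* %call
          // \endcode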
+          Changed = true;
+          ++NumRets;
+          DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Erasing: " << *Retain
+                       << "\n                             Erasing: "
+                       << *Autorelease << "\n");
+          EraseInstruction(Retain);
+          EraseInstruction(Autorelease);
+        }
+      }
+    }
+
+  next_block:
+    DependingInstructions.clear();
+    Visited.clear();
+  }
+
+  DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Finished List.\n\n");
+
+}
+
+bool ObjCARCOpt::doInitialization(Module &M) {
+  if (!EnableARCOpts)
+    return false;
+
+  // If nothing in the Module uses ARC, don't do anything.
+  Run = ModuleHasARC(M);
+  if (!Run)
+    return false;
+
+  // Identify the imprecise release metadata kind.
+  ImpreciseReleaseMDKind =
+    M.getContext().getMDKindID("clang.imprecise_release");
+  CopyOnEscapeMDKind =
+    M.getContext().getMDKindID("clang.arc.copy_on_escape");
+  NoObjCARCExceptionsMDKind =
+    M.getContext().getMDKindID("clang.arc.no_objc_arc_exceptions");
+
+  // Intuitively, objc_retain and others are nocapture, however in practice
+  // they are not, because they return their argument value. And objc_release
+  // calls finalizers which can have arbitrary side effects.
+
+  // These are initialized lazily.
+  RetainRVCallee = 0;
+  AutoreleaseRVCallee = 0;
+  ReleaseCallee = 0;
+  RetainCallee = 0;
+  RetainBlockCallee = 0;
+  AutoreleaseCallee = 0;
+
+  return false;
+}
+
+bool ObjCARCOpt::runOnFunction(Function &F) {
+  if (!EnableARCOpts)
+    return false;
+
+  // If nothing in the Module uses ARC, don't do anything.
+  if (!Run)
+    return false;
+
+  Changed = false;
+
+  DEBUG(dbgs() << "ObjCARCOpt: Visiting Function: " << F.getName() << "\n");
+
+  PA.setAA(&getAnalysis<AliasAnalysis>());
+
+  // This pass performs several distinct transformations. As a compile-time aid
+  // when compiling code that isn't ObjC, skip these if the relevant ObjC
+  // library functions aren't declared.
+
+  // Preliminary optimizations. This also computes UsedInThisFunction.
+  OptimizeIndividualCalls(F);
+
+  // Optimizations for weak pointers.
+  if (UsedInThisFunction & ((1 << IC_LoadWeak) |
+                            (1 << IC_LoadWeakRetained) |
+                            (1 << IC_StoreWeak) |
+                            (1 << IC_InitWeak) |
+                            (1 << IC_CopyWeak) |
+                            (1 << IC_MoveWeak) |
+                            (1 << IC_DestroyWeak)))
+    OptimizeWeakCalls(F);
+
+  // Optimizations for retain+release pairs.
+  if (UsedInThisFunction & ((1 << IC_Retain) |
+                            (1 << IC_RetainRV) |
+                            (1 << IC_RetainBlock)))
+    if (UsedInThisFunction & (1 << IC_Release))
+      // Run OptimizeSequences until it either stops making changes or
+      // no retain+release pair nesting is detected.
+      while (OptimizeSequences(F)) {}
+
+  // Optimizations if objc_autorelease is used.
+  if (UsedInThisFunction & ((1 << IC_Autorelease) |
+                            (1 << IC_AutoreleaseRV)))
+    OptimizeReturns(F);
+
+  DEBUG(dbgs() << "\n");
+
+  return Changed;
+}
+
+void ObjCARCOpt::releaseMemory() {
+  PA.clear();
+}
+
+/// @}
+///
diff --git a/lib/Transforms/ObjCARC/ObjCARCUtil.cpp b/lib/Transforms/ObjCARC/ObjCARCUtil.cpp
new file mode 100644
index 0000000..a841c64
--- /dev/null
+++ b/lib/Transforms/ObjCARC/ObjCARCUtil.cpp
@@ -0,0 +1,241 @@
+//===- ObjCARCUtil.cpp - ObjC ARC Optimization --------*- mode: c++ -*-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines several utility functions used by various ARC
+/// optimizations which are IMHO too big to be in a header file.
+///
+/// WARNING: This file knows about certain library functions.
It recognizes them +/// by name, and hardwires knowledge of their semantics. +/// +/// WARNING: This file knows about how certain Objective-C library functions are +/// used. Naive LLVM IR transformations which would otherwise be +/// behavior-preserving may break these assumptions. +/// +//===----------------------------------------------------------------------===// + +#include "ObjCARC.h" +#include "llvm/IR/Intrinsics.h" + +using namespace llvm; +using namespace llvm::objcarc; + +raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS, + const InstructionClass Class) { + switch (Class) { + case IC_Retain: + return OS << "IC_Retain"; + case IC_RetainRV: + return OS << "IC_RetainRV"; + case IC_RetainBlock: + return OS << "IC_RetainBlock"; + case IC_Release: + return OS << "IC_Release"; + case IC_Autorelease: + return OS << "IC_Autorelease"; + case IC_AutoreleaseRV: + return OS << "IC_AutoreleaseRV"; + case IC_AutoreleasepoolPush: + return OS << "IC_AutoreleasepoolPush"; + case IC_AutoreleasepoolPop: + return OS << "IC_AutoreleasepoolPop"; + case IC_NoopCast: + return OS << "IC_NoopCast"; + case IC_FusedRetainAutorelease: + return OS << "IC_FusedRetainAutorelease"; + case IC_FusedRetainAutoreleaseRV: + return OS << "IC_FusedRetainAutoreleaseRV"; + case IC_LoadWeakRetained: + return OS << "IC_LoadWeakRetained"; + case IC_StoreWeak: + return OS << "IC_StoreWeak"; + case IC_InitWeak: + return OS << "IC_InitWeak"; + case IC_LoadWeak: + return OS << "IC_LoadWeak"; + case IC_MoveWeak: + return OS << "IC_MoveWeak"; + case IC_CopyWeak: + return OS << "IC_CopyWeak"; + case IC_DestroyWeak: + return OS << "IC_DestroyWeak"; + case IC_StoreStrong: + return OS << "IC_StoreStrong"; + case IC_CallOrUser: + return OS << "IC_CallOrUser"; + case IC_Call: + return OS << "IC_Call"; + case IC_User: + return OS << "IC_User"; + case IC_None: + return OS << "IC_None"; + } + llvm_unreachable("Unknown instruction class!"); +} + +InstructionClass llvm::objcarc::GetFunctionClass(const Function *F) { + Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end(); + + // No arguments. + if (AI == AE) + return StringSwitch<InstructionClass>(F->getName()) + .Case("objc_autoreleasePoolPush", IC_AutoreleasepoolPush) + .Default(IC_CallOrUser); + + // One argument. + const Argument *A0 = AI++; + if (AI == AE) + // Argument is a pointer. + if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) { + Type *ETy = PTy->getElementType(); + // Argument is i8*. 
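// An illustrative note (not part of this patch): these StringSwitch cascades
// classify the runtime entry points purely by name plus prototype shape,
// e.g.:
//
// \code
//   declare i8* @objc_retain(i8*)      ; one i8* argument  -> IC_Retain
//   declare i8* @objc_loadWeak(i8**)   ; one i8** argument -> IC_LoadWeak
// \endcode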
+ if (ETy->isIntegerTy(8)) + return StringSwitch<InstructionClass>(F->getName()) + .Case("objc_retain", IC_Retain) + .Case("objc_retainAutoreleasedReturnValue", IC_RetainRV) + .Case("objc_retainBlock", IC_RetainBlock) + .Case("objc_release", IC_Release) + .Case("objc_autorelease", IC_Autorelease) + .Case("objc_autoreleaseReturnValue", IC_AutoreleaseRV) + .Case("objc_autoreleasePoolPop", IC_AutoreleasepoolPop) + .Case("objc_retainedObject", IC_NoopCast) + .Case("objc_unretainedObject", IC_NoopCast) + .Case("objc_unretainedPointer", IC_NoopCast) + .Case("objc_retain_autorelease", IC_FusedRetainAutorelease) + .Case("objc_retainAutorelease", IC_FusedRetainAutorelease) + .Case("objc_retainAutoreleaseReturnValue",IC_FusedRetainAutoreleaseRV) + .Default(IC_CallOrUser); + + // Argument is i8** + if (PointerType *Pte = dyn_cast<PointerType>(ETy)) + if (Pte->getElementType()->isIntegerTy(8)) + return StringSwitch<InstructionClass>(F->getName()) + .Case("objc_loadWeakRetained", IC_LoadWeakRetained) + .Case("objc_loadWeak", IC_LoadWeak) + .Case("objc_destroyWeak", IC_DestroyWeak) + .Default(IC_CallOrUser); + } + + // Two arguments, first is i8**. + const Argument *A1 = AI++; + if (AI == AE) + if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) + if (PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType())) + if (Pte->getElementType()->isIntegerTy(8)) + if (PointerType *PTy1 = dyn_cast<PointerType>(A1->getType())) { + Type *ETy1 = PTy1->getElementType(); + // Second argument is i8* + if (ETy1->isIntegerTy(8)) + return StringSwitch<InstructionClass>(F->getName()) + .Case("objc_storeWeak", IC_StoreWeak) + .Case("objc_initWeak", IC_InitWeak) + .Case("objc_storeStrong", IC_StoreStrong) + .Default(IC_CallOrUser); + // Second argument is i8**. + if (PointerType *Pte1 = dyn_cast<PointerType>(ETy1)) + if (Pte1->getElementType()->isIntegerTy(8)) + return StringSwitch<InstructionClass>(F->getName()) + .Case("objc_moveWeak", IC_MoveWeak) + .Case("objc_copyWeak", IC_CopyWeak) + .Default(IC_CallOrUser); + } + + // Anything else. + return IC_CallOrUser; +} + +/// \brief Determine what kind of construct V is. +InstructionClass +llvm::objcarc::GetInstructionClass(const Value *V) { + if (const Instruction *I = dyn_cast<Instruction>(V)) { + // Any instruction other than bitcast and gep with a pointer operand have a + // use of an objc pointer. Bitcasts, GEPs, Selects, PHIs transfer a pointer + // to a subsequent use, rather than using it themselves, in this sense. + // As a short cut, several other opcodes are known to have no pointer + // operands of interest. And ret is never followed by a release, so it's + // not interesting to examine. + switch (I->getOpcode()) { + case Instruction::Call: { + const CallInst *CI = cast<CallInst>(I); + // Check for calls to special functions. + if (const Function *F = CI->getCalledFunction()) { + InstructionClass Class = GetFunctionClass(F); + if (Class != IC_CallOrUser) + return Class; + + // None of the intrinsic functions do objc_release. For intrinsics, the + // only question is whether or not they may be users. 
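      // An illustrative note (not part of this patch): lifetime markers and
      // debug intrinsics are deliberately IC_None here, so that compiling
      // with -g cannot change what the ARC optimizer does, e.g.:
      //
      // \code
      //   declare void @llvm.lifetime.start(i64, i8* nocapture)
      // \endcode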
+ switch (F->getIntrinsicID()) { + case Intrinsic::returnaddress: case Intrinsic::frameaddress: + case Intrinsic::stacksave: case Intrinsic::stackrestore: + case Intrinsic::vastart: case Intrinsic::vacopy: case Intrinsic::vaend: + case Intrinsic::objectsize: case Intrinsic::prefetch: + case Intrinsic::stackprotector: + case Intrinsic::eh_return_i32: case Intrinsic::eh_return_i64: + case Intrinsic::eh_typeid_for: case Intrinsic::eh_dwarf_cfa: + case Intrinsic::eh_sjlj_lsda: case Intrinsic::eh_sjlj_functioncontext: + case Intrinsic::init_trampoline: case Intrinsic::adjust_trampoline: + case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: + case Intrinsic::invariant_start: case Intrinsic::invariant_end: + // Don't let dbg info affect our results. + case Intrinsic::dbg_declare: case Intrinsic::dbg_value: + // Short cut: Some intrinsics obviously don't use ObjC pointers. + return IC_None; + default: + break; + } + } + return GetCallSiteClass(CI); + } + case Instruction::Invoke: + return GetCallSiteClass(cast<InvokeInst>(I)); + case Instruction::BitCast: + case Instruction::GetElementPtr: + case Instruction::Select: case Instruction::PHI: + case Instruction::Ret: case Instruction::Br: + case Instruction::Switch: case Instruction::IndirectBr: + case Instruction::Alloca: case Instruction::VAArg: + case Instruction::Add: case Instruction::FAdd: + case Instruction::Sub: case Instruction::FSub: + case Instruction::Mul: case Instruction::FMul: + case Instruction::SDiv: case Instruction::UDiv: case Instruction::FDiv: + case Instruction::SRem: case Instruction::URem: case Instruction::FRem: + case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: + case Instruction::And: case Instruction::Or: case Instruction::Xor: + case Instruction::SExt: case Instruction::ZExt: case Instruction::Trunc: + case Instruction::IntToPtr: case Instruction::FCmp: + case Instruction::FPTrunc: case Instruction::FPExt: + case Instruction::FPToUI: case Instruction::FPToSI: + case Instruction::UIToFP: case Instruction::SIToFP: + case Instruction::InsertElement: case Instruction::ExtractElement: + case Instruction::ShuffleVector: + case Instruction::ExtractValue: + break; + case Instruction::ICmp: + // Comparing a pointer with null, or any other constant, isn't an + // interesting use, because we don't care what the pointer points to, or + // about the values of any other dynamic reference-counted pointers. + if (IsPotentialRetainableObjPtr(I->getOperand(1))) + return IC_User; + break; + default: + // For anything else, check all the operands. + // Note that this includes both operands of a Store: while the first + // operand isn't actually being dereferenced, it is being stored to + // memory where we can no longer track who might read it and dereference + // it, so we have to consider it potentially used. + for (User::const_op_iterator OI = I->op_begin(), OE = I->op_end(); + OI != OE; ++OI) + if (IsPotentialRetainableObjPtr(*OI)) + return IC_User; + } + } + + // Otherwise, it's totally inert for ARC purposes. + return IC_None; +} diff --git a/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp b/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp new file mode 100644 index 0000000..ae3c628 --- /dev/null +++ b/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp @@ -0,0 +1,177 @@ +//===- ProvenanceAnalysis.cpp - ObjC ARC Optimization ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines a special form of Alias Analysis called ``Provenance
+/// Analysis''. The word ``provenance'' refers to the history of the ownership
+/// of an object. Thus ``Provenance Analysis'' is an analysis which attempts to
+/// use various techniques to determine if locally distinct pointers can have
+/// the same provenance source, and thus whether they could be related.
+///
+/// WARNING: This file knows about certain library functions. It recognizes
+/// them by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions
+/// are used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ObjCARC.h"
+#include "ProvenanceAnalysis.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+bool ProvenanceAnalysis::relatedSelect(const SelectInst *A,
+                                       const Value *B) {
+  // If the values are Selects with the same condition, we can do a more
+  // precise check: just check for relations between the values on
+  // corresponding arms.
+  if (const SelectInst *SB = dyn_cast<SelectInst>(B))
+    if (A->getCondition() == SB->getCondition())
+      return related(A->getTrueValue(), SB->getTrueValue()) ||
+             related(A->getFalseValue(), SB->getFalseValue());
+
+  // Check both arms of the Select node individually.
+  return related(A->getTrueValue(), B) ||
+         related(A->getFalseValue(), B);
+}
+
+bool ProvenanceAnalysis::relatedPHI(const PHINode *A,
+                                    const Value *B) {
+  // If the values are PHIs in the same block, we can do a more precise as
+  // well as efficient check: just check for relations between the values on
+  // corresponding edges.
+  if (const PHINode *PNB = dyn_cast<PHINode>(B))
+    if (PNB->getParent() == A->getParent()) {
+      for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i)
+        if (related(A->getIncomingValue(i),
+                    PNB->getIncomingValueForBlock(A->getIncomingBlock(i))))
+          return true;
+      return false;
+    }
+
+  // Check each unique source of the PHI node against B.
+  SmallPtrSet<const Value *, 4> UniqueSrc;
+  for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i) {
+    const Value *PV1 = A->getIncomingValue(i);
+    if (UniqueSrc.insert(PV1) && related(PV1, B))
+      return true;
+  }
+
+  // All of the arms checked out.
+  return false;
+}
+
+/// Test if the value of P, or any value covered by its provenance, is ever
+/// stored within the function (not counting callees).
+static bool IsStoredObjCPointer(const Value *P) {
+  SmallPtrSet<const Value *, 8> Visited;
+  SmallVector<const Value *, 8> Worklist;
+  Worklist.push_back(P);
+  Visited.insert(P);
+  do {
+    P = Worklist.pop_back_val();
+    for (Value::const_use_iterator UI = P->use_begin(), UE = P->use_end();
+         UI != UE; ++UI) {
+      const User *Ur = *UI;
+      if (isa<StoreInst>(Ur)) {
+        if (UI.getOperandNo() == 0)
+          // The pointer is stored.
+          return true;
+        // The pointer is stored through.
+        continue;
+      }
+      if (isa<CallInst>(Ur))
+        // The pointer is passed as an argument, ignore this.
+        continue;
+      if (isa<PtrToIntInst>(P))
+        // Assume the worst.
+        return true;
+      if (Visited.insert(Ur))
+        Worklist.push_back(Ur);
+    }
+  } while (!Worklist.empty());
+
+  // Everything checked out.
+  return false;
+}
+
+bool ProvenanceAnalysis::relatedCheck(const Value *A,
+                                      const Value *B) {
+  // Skip past provenance pass-throughs.
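  // An illustrative sketch (hypothetical IR, not from this patch):
  // pass-throughs include casts and the ObjC calls that simply return their
  // argument, so %0 and %1 below both share the provenance of %p:
  //
  // \code
  //   %0 = bitcast i8* %p to i8*
  //   %1 = call i8* @objc_retain(i8* %0)
  // \endcode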
+  A = GetUnderlyingObjCPtr(A);
+  B = GetUnderlyingObjCPtr(B);
+
+  // Quick check.
+  if (A == B)
+    return true;
+
+  // Ask regular AliasAnalysis, for a first approximation.
+  switch (AA->alias(A, B)) {
+  case AliasAnalysis::NoAlias:
+    return false;
+  case AliasAnalysis::MustAlias:
+  case AliasAnalysis::PartialAlias:
+    return true;
+  case AliasAnalysis::MayAlias:
+    break;
+  }
+
+  bool AIsIdentified = IsObjCIdentifiedObject(A);
+  bool BIsIdentified = IsObjCIdentifiedObject(B);
+
+  // An ObjC-Identified object can't alias a load if it is never locally stored.
+  if (AIsIdentified) {
+    // Check for an obvious escape.
+    if (isa<LoadInst>(B))
+      return IsStoredObjCPointer(A);
+    if (BIsIdentified) {
+      // Check for an obvious escape.
+      if (isa<LoadInst>(A))
+        return IsStoredObjCPointer(B);
+      // Both pointers are identified and escapes aren't an evident problem.
+      return false;
+    }
+  } else if (BIsIdentified) {
+    // Check for an obvious escape.
+    if (isa<LoadInst>(A))
+      return IsStoredObjCPointer(B);
+  }
+
+  // Special handling for PHI and Select.
+  if (const PHINode *PN = dyn_cast<PHINode>(A))
+    return relatedPHI(PN, B);
+  if (const PHINode *PN = dyn_cast<PHINode>(B))
+    return relatedPHI(PN, A);
+  if (const SelectInst *S = dyn_cast<SelectInst>(A))
+    return relatedSelect(S, B);
+  if (const SelectInst *S = dyn_cast<SelectInst>(B))
+    return relatedSelect(S, A);
+
+  // Conservative.
+  return true;
+}
+
+bool ProvenanceAnalysis::related(const Value *A,
+                                 const Value *B) {
+  // Begin by inserting a conservative value into the map. If the insertion
+  // fails, we have the answer already. If it succeeds, leave it there until
+  // we compute the real answer to guard against recursive queries.
+  if (A > B) std::swap(A, B);
+  std::pair<CachedResultsTy::iterator, bool> Pair =
+    CachedResults.insert(std::make_pair(ValuePairTy(A, B), true));
+  if (!Pair.second)
+    return Pair.first->second;
+
+  bool Result = relatedCheck(A, B);
+  CachedResults[ValuePairTy(A, B)] = Result;
+  return Result;
+}
diff --git a/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
new file mode 100644
index 0000000..ec449fd
--- /dev/null
+++ b/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
@@ -0,0 +1,80 @@
+//===- ProvenanceAnalysis.h - ObjC ARC Optimization ---*- mode: c++ -*-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file declares a special form of Alias Analysis called ``Provenance
+/// Analysis''. The word ``provenance'' refers to the history of the ownership
+/// of an object. Thus ``Provenance Analysis'' is an analysis which attempts to
+/// use various techniques to determine if locally distinct pointers can have
+/// the same provenance source, and thus whether they could be related.
+///
+/// WARNING: This file knows about certain library functions. It recognizes
+/// them by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions
+/// are used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H +#define LLVM_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H + +#include "llvm/ADT/DenseMap.h" + +namespace llvm { + class Value; + class AliasAnalysis; + class PHINode; + class SelectInst; +} + +namespace llvm { +namespace objcarc { + +/// \brief This is similar to BasicAliasAnalysis, and it uses many of the same +/// techniques, except it uses special ObjC-specific reasoning about pointer +/// relationships. +/// +/// In this context ``Provenance'' is defined as the history of an object's +/// ownership. Thus ``Provenance Analysis'' is defined by using the notion of +/// an ``independent provenance source'' of a pointer to determine whether or +/// not two pointers have the same provenance source and thus could +/// potentially be related. +class ProvenanceAnalysis { + AliasAnalysis *AA; + + typedef std::pair<const Value *, const Value *> ValuePairTy; + typedef DenseMap<ValuePairTy, bool> CachedResultsTy; + CachedResultsTy CachedResults; + + bool relatedCheck(const Value *A, const Value *B); + bool relatedSelect(const SelectInst *A, const Value *B); + bool relatedPHI(const PHINode *A, const Value *B); + + void operator=(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION; + ProvenanceAnalysis(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION; + +public: + ProvenanceAnalysis() {} + + void setAA(AliasAnalysis *aa) { AA = aa; } + + AliasAnalysis *getAA() const { return AA; } + + bool related(const Value *A, const Value *B); + + void clear() { + CachedResults.clear(); + } +}; + +} // end namespace objcarc +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index b3fc6e3..fd55e08 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -21,7 +21,6 @@ add_llvm_library(LLVMScalarOpts LoopUnswitch.cpp LowerAtomic.cpp MemCpyOptimizer.cpp - ObjCARC.cpp Reassociate.cpp Reg2Mem.cpp SCCP.cpp diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index d513c96..d71dd5d 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -729,9 +729,9 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(BasicBlock *BB) { // It's not safe to eliminate the sign / zero extension of the return value. // See llvm::isInTailCallPosition(). const Function *F = BB->getParent(); - Attribute CallerRetAttr = F->getAttributes().getRetAttributes(); - if (CallerRetAttr.hasAttribute(Attribute::ZExt) || - CallerRetAttr.hasAttribute(Attribute::SExt)) + AttributeSet CallerAttrs = F->getAttributes(); + if (CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt) || + CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt)) return false; // Make sure there are no instructions between the PHI and return, or that the @@ -788,10 +788,10 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(BasicBlock *BB) { // Conservatively require the attributes of the call to match those of the // return. Ignore noalias because it doesn't affect the call sequence. - Attribute CalleeRetAttr = CS.getAttributes().getRetAttributes(); - if (AttrBuilder(CalleeRetAttr). + AttributeSet CalleeAttrs = CS.getAttributes(); + if (AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex). removeAttribute(Attribute::NoAlias) != - AttrBuilder(CallerRetAttr). 
+ AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex). removeAttribute(Attribute::NoAlias)) continue; diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 4c3631b..995782e 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -21,6 +21,8 @@ #include "llvm/IR/Instructions.h" #include "llvm/Pass.h" #include "llvm/Support/CFG.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -97,12 +99,29 @@ bool CorrelatedValuePropagation::processPHI(PHINode *P) { Value *Incoming = P->getIncomingValue(i); if (isa<Constant>(Incoming)) continue; - Constant *C = LVI->getConstantOnEdge(P->getIncomingValue(i), - P->getIncomingBlock(i), - BB); - if (!C) continue; + Value *V = LVI->getConstantOnEdge(Incoming, P->getIncomingBlock(i), BB); - P->setIncomingValue(i, C); + // Look if the incoming value is a select with a constant but LVI tells us + // that the incoming value can never be that constant. In that case replace + // the incoming value with the other value of the select. This often allows + // us to remove the select later. + if (!V) { + SelectInst *SI = dyn_cast<SelectInst>(Incoming); + if (!SI) continue; + + Constant *C = dyn_cast<Constant>(SI->getFalseValue()); + if (!C) continue; + + if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C, + P->getIncomingBlock(i), BB) != + LazyValueInfo::False) + continue; + + DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n'); + V = SI->getTrueValue(); + } + + P->setIncomingValue(i, V); Changed = true; } diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index fe3acbf..57432c7 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -376,10 +376,10 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, // Check to see if the later store is to the entire object (either a global, // an alloca, or a byval argument). If so, then it clearly overwrites any // other store to the same object. - const DataLayout &TD = *AA.getDataLayout(); + const DataLayout *TD = AA.getDataLayout(); - const Value *UO1 = GetUnderlyingObject(P1, &TD), - *UO2 = GetUnderlyingObject(P2, &TD); + const Value *UO1 = GetUnderlyingObject(P1, TD), + *UO2 = GetUnderlyingObject(P2, TD); // If we can't resolve the same pointers to the same object, then we can't // analyze them at all. diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 14201b9..c04b447 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -849,8 +849,8 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, return -1; int64_t StoreOffset = 0, LoadOffset = 0; - Value *StoreBase = GetPointerBaseWithConstantOffset(WritePtr, StoreOffset,TD); - Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, TD); + Value *StoreBase = GetPointerBaseWithConstantOffset(WritePtr,StoreOffset,&TD); + Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, &TD); if (StoreBase != LoadBase) return -1; @@ -945,7 +945,7 @@ static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, // then we should widen it! 
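  // An illustrative sketch (hypothetical IR in the typed-pointer syntax of
  // this era, not from this patch): a narrow load whose bytes lie at a known
  // constant offset inside an earlier, wider load from the same base can be
  // satisfied by reusing (widening) the earlier load:
  //
  // \code
  //   %wide = load i32* %P                ; earlier 4-byte load
  //   %P8 = bitcast i32* %P to i8*
  //   %q = getelementptr i8* %P8, i64 2
  //   %narrow = load i8* %q               ; covered by the bytes of %wide
  // \endcode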
int64_t LoadOffs = 0; const Value *LoadBase = - GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, TD); + GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, &TD); unsigned LoadSize = TD.getTypeStoreSize(LoadTy); unsigned Size = MemoryDependenceAnalysis:: @@ -1526,10 +1526,8 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { BasicBlock *LoadBB = LI->getParent(); BasicBlock *TmpBB = LoadBB; - bool isSinglePred = false; bool allSingleSucc = true; while (TmpBB->getSinglePredecessor()) { - isSinglePred = true; TmpBB = TmpBB->getSinglePredecessor(); if (TmpBB == LoadBB) // Infinite (unreachable) loop. return false; @@ -1548,28 +1546,6 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { assert(TmpBB); LoadBB = TmpBB; - // FIXME: It is extremely unclear what this loop is doing, other than - // artificially restricting loadpre. - if (isSinglePred) { - bool isHot = false; - for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) { - const AvailableValueInBlock &AV = ValuesPerBlock[i]; - if (AV.isSimpleValue()) - // "Hot" Instruction is in some loop (because it dominates its dep. - // instruction). - if (Instruction *I = dyn_cast<Instruction>(AV.getSimpleValue())) - if (DT->dominates(LI, I)) { - isHot = true; - break; - } - } - - // We are interested only in "hot" instructions. We don't want to do any - // mis-optimizations here. - if (!isHot) - return false; - } - // Check to see how many predecessors have the loaded value fully // available. DenseMap<BasicBlock*, Value*> PredLoads; @@ -2371,8 +2347,8 @@ bool GVN::processBlock(BasicBlock *BB) { E = InstrsToErase.end(); I != E; ++I) { DEBUG(dbgs() << "GVN removed: " << **I << '\n'); if (MD) MD->removeInstruction(*I); - (*I)->eraseFromParent(); DEBUG(verifyRemoved(*I)); + (*I)->eraseFromParent(); } InstrsToErase.clear(); @@ -2389,7 +2365,7 @@ bool GVN::processBlock(BasicBlock *BB) { /// control flow patterns and attempts to perform simple PRE at the join point. bool GVN::performPRE(Function &F) { bool Changed = false; - DenseMap<BasicBlock*, Value*> predMap; + SmallVector<std::pair<Value*, BasicBlock*>, 8> predMap; for (df_iterator<BasicBlock*> DI = df_begin(&F.getEntryBlock()), DE = df_end(&F.getEntryBlock()); DI != DE; ++DI) { BasicBlock *CurrentBlock = *DI; @@ -2445,19 +2421,22 @@ bool GVN::performPRE(Function &F) { if (P == CurrentBlock) { NumWithout = 2; break; - } else if (!DT->dominates(&F.getEntryBlock(), P)) { + } else if (!DT->isReachableFromEntry(P)) { NumWithout = 2; break; } Value* predV = findLeader(P, ValNo); if (predV == 0) { + predMap.push_back(std::make_pair(static_cast<Value *>(0), P)); PREPred = P; ++NumWithout; } else if (predV == CurInst) { + /* CurInst dominates this predecessor. */ NumWithout = 2; + break; } else { - predMap[P] = predV; + predMap.push_back(std::make_pair(predV, P)); ++NumWith; } } @@ -2504,15 +2483,14 @@ bool GVN::performPRE(Function &F) { // the PRE predecessor. This is typically because of loads which // are not value numbered precisely. if (!success) { - delete PREInstr; DEBUG(verifyRemoved(PREInstr)); + delete PREInstr; continue; } PREInstr->insertBefore(PREPred->getTerminator()); PREInstr->setName(CurInst->getName() + ".pre"); PREInstr->setDebugLoc(CurInst->getDebugLoc()); - predMap[PREPred] = PREInstr; VN.add(PREInstr, ValNo); ++NumGVNPRE; @@ -2520,13 +2498,14 @@ bool GVN::performPRE(Function &F) { addToLeaderTable(ValNo, PREInstr, PREPred); // Create a PHI to make the value available in this block. 
- pred_iterator PB = pred_begin(CurrentBlock), PE = pred_end(CurrentBlock); - PHINode* Phi = PHINode::Create(CurInst->getType(), std::distance(PB, PE), + PHINode* Phi = PHINode::Create(CurInst->getType(), predMap.size(), CurInst->getName() + ".pre-phi", CurrentBlock->begin()); - for (pred_iterator PI = PB; PI != PE; ++PI) { - BasicBlock *P = *PI; - Phi->addIncoming(predMap[P], P); + for (unsigned i = 0, e = predMap.size(); i != e; ++i) { + if (Value *V = predMap[i].first) + Phi->addIncoming(V, predMap[i].second); + else + Phi->addIncoming(PREInstr, PREPred); } VN.add(Phi, ValNo); @@ -2551,8 +2530,8 @@ bool GVN::performPRE(Function &F) { DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n'); if (MD) MD->removeInstruction(CurInst); - CurInst->eraseFromParent(); DEBUG(verifyRemoved(CurInst)); + CurInst->eraseFromParent(); Changed = true; } } diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index dc6bef7..f94cd2a 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -440,13 +440,12 @@ bool LICM::canSinkOrHoistInst(Instruction &I) { } // Only these instructions are hoistable/sinkable. - bool HoistableKind = (isa<BinaryOperator>(I) || isa<CastInst>(I) || - isa<SelectInst>(I) || isa<GetElementPtrInst>(I) || - isa<CmpInst>(I) || isa<InsertElementInst>(I) || - isa<ExtractElementInst>(I) || - isa<ShuffleVectorInst>(I)); - if (!HoistableKind) - return false; + if (!isa<BinaryOperator>(I) && !isa<CastInst>(I) && !isa<SelectInst>(I) && + !isa<GetElementPtrInst>(I) && !isa<CmpInst>(I) && + !isa<InsertElementInst>(I) && !isa<ExtractElementInst>(I) && + !isa<ShuffleVectorInst>(I) && !isa<ExtractValueInst>(I) && + !isa<InsertValueInst>(I)) + return false; return isSafeToExecuteUnconditionally(I); } diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index c4f9012..8258719 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -407,7 +407,7 @@ bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst, // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)" { - if (DefX2->getOpcode() != Instruction::And) + if (!DefX2 || DefX2->getOpcode() != Instruction::And) return false; BinaryOperator *SubOneOp; diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp index c48808f..a23860a 100644 --- a/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -14,6 +14,7 @@ #define DEBUG_TYPE "loop-instsimplify" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index 0ea80f3..e98ae95 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -18,6 +18,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" @@ -51,6 +52,7 @@ namespace { AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); AU.addPreserved<ScalarEvolution>(); + AU.addRequired<TargetTransformInfo>(); } bool runOnLoop(Loop *L, LPPassManager &LPM); @@ -59,11 +61,13 @@ 
namespace { private: LoopInfo *LI; + const TargetTransformInfo *TTI; }; } char LoopRotate::ID = 0; INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false) +INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) @@ -75,6 +79,7 @@ Pass *llvm::createLoopRotatePass() { return new LoopRotate(); } /// the loop is rotated at least once. bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) { LI = &getAnalysis<LoopInfo>(); + TTI = &getAnalysis<TargetTransformInfo>(); // Simplify the loop latch before attempting to rotate the header // upward. Rotation may not be needed if the loop tail can be folded into the @@ -278,7 +283,7 @@ bool LoopRotate::rotateLoop(Loop *L) { // duplicate blocks inside it. { CodeMetrics Metrics; - Metrics.analyzeBasicBlock(OrigHeader); + Metrics.analyzeBasicBlock(OrigHeader, *TTI); if (Metrics.notDuplicatable) { DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non duplicatable" << " instructions: "; L->dump()); diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index c7b853e..4e4cb86 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -58,6 +58,7 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallBitVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/IVUsers.h" #include "llvm/Analysis/LoopPass.h" @@ -237,7 +238,7 @@ struct Formula { /// BaseRegs - The list of "base" registers for this use. When this is /// non-empty, - SmallVector<const SCEV *, 2> BaseRegs; + SmallVector<const SCEV *, 4> BaseRegs; /// ScaledReg - The 'scaled' register for this use. This should be non-null /// when Scale is not zero. @@ -1087,19 +1088,19 @@ namespace { /// UniquifierDenseMapInfo - A DenseMapInfo implementation for holding /// DenseMaps and DenseSets of sorted SmallVectors of const SCEV*. struct UniquifierDenseMapInfo { - static SmallVector<const SCEV *, 2> getEmptyKey() { - SmallVector<const SCEV *, 2> V; + static SmallVector<const SCEV *, 4> getEmptyKey() { + SmallVector<const SCEV *, 4> V; V.push_back(reinterpret_cast<const SCEV *>(-1)); return V; } - static SmallVector<const SCEV *, 2> getTombstoneKey() { - SmallVector<const SCEV *, 2> V; + static SmallVector<const SCEV *, 4> getTombstoneKey() { + SmallVector<const SCEV *, 4> V; V.push_back(reinterpret_cast<const SCEV *>(-2)); return V; } - static unsigned getHashValue(const SmallVector<const SCEV *, 2> &V) { + static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) { unsigned Result = 0; for (SmallVectorImpl<const SCEV *>::const_iterator I = V.begin(), E = V.end(); I != E; ++I) @@ -1107,8 +1108,8 @@ struct UniquifierDenseMapInfo { return Result; } - static bool isEqual(const SmallVector<const SCEV *, 2> &LHS, - const SmallVector<const SCEV *, 2> &RHS) { + static bool isEqual(const SmallVector<const SCEV *, 4> &LHS, + const SmallVector<const SCEV *, 4> &RHS) { return LHS == RHS; } }; @@ -1119,7 +1120,7 @@ struct UniquifierDenseMapInfo { /// the user itself, and information about how the use may be satisfied. /// TODO: Represent multiple users of the same expression in common? 
class LSRUse { - DenseSet<SmallVector<const SCEV *, 2>, UniquifierDenseMapInfo> Uniquifier; + DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier; public: /// KindType - An enum for a kind of use, indicating what types of @@ -1178,7 +1179,7 @@ public: /// HasFormula - Test whether this use as a formula which has the same /// registers as the given formula. bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { - SmallVector<const SCEV *, 2> Key = F.BaseRegs; + SmallVector<const SCEV *, 4> Key = F.BaseRegs; if (F.ScaledReg) Key.push_back(F.ScaledReg); // Unstable sort by host order ok, because this is only used for uniquifying. std::sort(Key.begin(), Key.end()); @@ -1188,7 +1189,7 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { /// InsertFormula - If the given formula has not yet been inserted, add it to /// the list, and return true. Return false otherwise. bool LSRUse::InsertFormula(const Formula &F) { - SmallVector<const SCEV *, 2> Key = F.BaseRegs; + SmallVector<const SCEV *, 4> Key = F.BaseRegs; if (F.ScaledReg) Key.push_back(F.ScaledReg); // Unstable sort by host order ok, because this is only used for uniquifying. std::sort(Key.begin(), Key.end()); @@ -2536,6 +2537,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, // Add this IV user to the end of the chain. IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr)); } + IVChain &Chain = IVChainVec[ChainIdx]; SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers; // This chain's NearUsers become FarUsers. @@ -2553,8 +2555,19 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, for (Value::use_iterator UseIter = IVOper->use_begin(), UseEnd = IVOper->use_end(); UseIter != UseEnd; ++UseIter) { Instruction *OtherUse = dyn_cast<Instruction>(*UseIter); - if (!OtherUse || OtherUse == UserInst) + if (!OtherUse) continue; + // Uses in the chain will no longer be uses if the chain is formed. + // Include the head of the chain in this iteration (not Chain.begin()). + IVChain::const_iterator IncIter = Chain.Incs.begin(); + IVChain::const_iterator IncEnd = Chain.Incs.end(); + for( ; IncIter != IncEnd; ++IncIter) { + if (IncIter->UserInst == OtherUse) + break; + } + if (IncIter != IncEnd) + continue; + if (SE.isSCEVable(OtherUse->getType()) && !isa<SCEVUnknown>(SE.getSCEV(OtherUse)) && IU.isIVUserOrOperand(OtherUse)) { @@ -2891,7 +2904,6 @@ void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { Formula F; F.InitialMatch(S, L, SE); - F.HasBaseReg = true; bool Inserted = InsertFormula(LU, LUIdx, F); assert(Inserted && "Initial formula already exists!"); (void)Inserted; } @@ -2903,6 +2915,7 @@ LSRInstance::InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { Formula F; F.BaseRegs.push_back(S); + F.HasBaseReg = true; bool Inserted = InsertFormula(LU, LUIdx, F); assert(Inserted && "Supplemental formula already exists!"); (void)Inserted; } @@ -3656,7 +3669,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { // Collect the best formula for each unique set of shared registers. This // is reset for each use. 
- typedef DenseMap<SmallVector<const SCEV *, 2>, size_t, UniquifierDenseMapInfo> + typedef DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo> BestFormulaeTy; BestFormulaeTy BestFormulae; @@ -3691,7 +3704,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { dbgs() << "\n"); } else { - SmallVector<const SCEV *, 2> Key; + SmallVector<const SCEV *, 4> Key; for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(), JE = F.BaseRegs.end(); J != JE; ++J) { const SCEV *Reg = *J; @@ -3837,83 +3850,83 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { /// for expressions like A, A+1, A+2, etc., allocate a single register for /// them. void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { - if (EstimateSearchSpaceComplexity() >= ComplexityLimit) { - DEBUG(dbgs() << "The search space is too complex.\n"); + if (EstimateSearchSpaceComplexity() < ComplexityLimit) + return; - DEBUG(dbgs() << "Narrowing the search space by assuming that uses " - "separated by a constant offset will use the same " - "registers.\n"); + DEBUG(dbgs() << "The search space is too complex.\n" + "Narrowing the search space by assuming that uses separated " + "by a constant offset will use the same registers.\n"); - // This is especially useful for unrolled loops. + // This is especially useful for unrolled loops. - for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { - LSRUse &LU = Uses[LUIdx]; - for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), - E = LU.Formulae.end(); I != E; ++I) { - const Formula &F = *I; - if (F.BaseOffset != 0 && F.Scale == 0) { - if (LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU)) { - if (reconcileNewOffset(*LUThatHas, F.BaseOffset, - /*HasBaseReg=*/false, - LU.Kind, LU.AccessTy)) { - DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); - dbgs() << '\n'); - - LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop; - - // Update the relocs to reference the new use. - for (SmallVectorImpl<LSRFixup>::iterator I = Fixups.begin(), - E = Fixups.end(); I != E; ++I) { - LSRFixup &Fixup = *I; - if (Fixup.LUIdx == LUIdx) { - Fixup.LUIdx = LUThatHas - &Uses.front(); - Fixup.Offset += F.BaseOffset; - // Add the new offset to LUThatHas' offset list. - if (LUThatHas->Offsets.back() != Fixup.Offset) { - LUThatHas->Offsets.push_back(Fixup.Offset); - if (Fixup.Offset > LUThatHas->MaxOffset) - LUThatHas->MaxOffset = Fixup.Offset; - if (Fixup.Offset < LUThatHas->MinOffset) - LUThatHas->MinOffset = Fixup.Offset; - } - DEBUG(dbgs() << "New fixup has offset " - << Fixup.Offset << '\n'); - } - if (Fixup.LUIdx == NumUses-1) - Fixup.LUIdx = LUIdx; - } + for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { + LSRUse &LU = Uses[LUIdx]; + for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), + E = LU.Formulae.end(); I != E; ++I) { + const Formula &F = *I; + if (F.BaseOffset == 0 || F.Scale != 0) + continue; - // Delete formulae from the new use which are no longer legal. 
- bool Any = false; - for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) { - Formula &F = LUThatHas->Formulae[i]; - if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset, - LUThatHas->Kind, LUThatHas->AccessTy, F)) { - DEBUG(dbgs() << " Deleting "; F.print(dbgs()); - dbgs() << '\n'); - LUThatHas->DeleteFormula(F); - --i; - --e; - Any = true; - } - } - if (Any) - LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses); + LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU); + if (!LUThatHas) + continue; - // Delete the old use. - DeleteUse(LU, LUIdx); - --LUIdx; - --NumUses; - break; - } + if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false, + LU.Kind, LU.AccessTy)) + continue; + + DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n'); + + LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop; + + // Update the relocs to reference the new use. + for (SmallVectorImpl<LSRFixup>::iterator I = Fixups.begin(), + E = Fixups.end(); I != E; ++I) { + LSRFixup &Fixup = *I; + if (Fixup.LUIdx == LUIdx) { + Fixup.LUIdx = LUThatHas - &Uses.front(); + Fixup.Offset += F.BaseOffset; + // Add the new offset to LUThatHas' offset list. + if (LUThatHas->Offsets.back() != Fixup.Offset) { + LUThatHas->Offsets.push_back(Fixup.Offset); + if (Fixup.Offset > LUThatHas->MaxOffset) + LUThatHas->MaxOffset = Fixup.Offset; + if (Fixup.Offset < LUThatHas->MinOffset) + LUThatHas->MinOffset = Fixup.Offset; } + DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n'); } + if (Fixup.LUIdx == NumUses-1) + Fixup.LUIdx = LUIdx; } - } - DEBUG(dbgs() << "After pre-selection:\n"; - print_uses(dbgs())); + // Delete formulae from the new use which are no longer legal. + bool Any = false; + for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) { + Formula &F = LUThatHas->Formulae[i]; + if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset, + LUThatHas->Kind, LUThatHas->AccessTy, F)) { + DEBUG(dbgs() << " Deleting "; F.print(dbgs()); + dbgs() << '\n'); + LUThatHas->DeleteFormula(F); + --i; + --e; + Any = true; + } + } + + if (Any) + LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses); + + // Delete the old use. + DeleteUse(LU, LUIdx); + --LUIdx; + --NumUses; + break; + } } + + DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs())); } /// NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters - Call diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index e0f915b..80d060b 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -17,6 +17,7 @@ #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/CommandLine.h" @@ -90,6 +91,7 @@ namespace { AU.addPreservedID(LCSSAID); AU.addRequired<ScalarEvolution>(); AU.addPreserved<ScalarEvolution>(); + AU.addRequired<TargetTransformInfo>(); // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info. // If loop unroll does not preserve dom info then LCSSA pass on next // loop will receive invalid dom info. 
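The same TargetTransformInfo wiring recurs across LoopRotation, LoopUnroll, and
LoopUnswitch in this patch. A minimal sketch of the recurring pattern, under
the pass API of this era (names taken from the hunks above and below):

    // In getAnalysisUsage: make TTI available to the pass.
    AU.addRequired<TargetTransformInfo>();

    // In runOnLoop: size estimation now goes through TTI.
    const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
    CodeMetrics Metrics;
    for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
         I != E; ++I)
      Metrics.analyzeBasicBlock(*I, TTI);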
@@ -101,6 +103,7 @@ namespace { char LoopUnroll::ID = 0; INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false) +INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) @@ -113,11 +116,12 @@ Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial) { /// ApproximateLoopSize - Approximate the size of the loop. static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, - bool &NotDuplicatable, const DataLayout *TD) { + bool &NotDuplicatable, + const TargetTransformInfo &TTI) { CodeMetrics Metrics; for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) - Metrics.analyzeBasicBlock(*I, TD); + Metrics.analyzeBasicBlock(*I, TTI); NumCalls = Metrics.NumInlineCandidates; NotDuplicatable = Metrics.notDuplicatable; @@ -134,6 +138,7 @@ static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { LoopInfo *LI = &getAnalysis<LoopInfo>(); ScalarEvolution *SE = &getAnalysis<ScalarEvolution>(); + const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>(); BasicBlock *Header = L->getHeader(); DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName() @@ -181,11 +186,10 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // Enforce the threshold. if (Threshold != NoThreshold) { - const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); unsigned NumInlineCandidates; bool notDuplicatable; unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates, - notDuplicatable, TD); + notDuplicatable, TTI); DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); if (notDuplicatable) { DEBUG(dbgs() << " Not unrolling loop which contains non duplicatable" diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 68d4423..0e8199f 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -37,6 +37,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" @@ -101,7 +102,7 @@ namespace { // Analyze loop. Check its size, calculate is it possible to unswitch // it. Returns true if we can unswitch this loop. - bool countLoop(const Loop* L); + bool countLoop(const Loop* L, const TargetTransformInfo &TTI); // Clean all data related to given loop. void forgetLoop(const Loop* L); @@ -170,6 +171,7 @@ namespace { AU.addPreservedID(LCSSAID); AU.addPreserved<DominatorTree>(); AU.addPreserved<ScalarEvolution>(); + AU.addRequired<TargetTransformInfo>(); } private: @@ -221,7 +223,7 @@ namespace { // Analyze loop. Check its size, calculate is it possible to unswitch // it. Returns true if we can unswitch this loop. 
-bool LUAnalysisCache::countLoop(const Loop* L) { +bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) { std::pair<LoopPropsMapIt, bool> InsertRes = LoopsProperties.insert(std::make_pair(L, LoopProperties())); @@ -243,7 +245,7 @@ bool LUAnalysisCache::countLoop(const Loop* L) { for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) - Metrics.analyzeBasicBlock(*I); + Metrics.analyzeBasicBlock(*I, TTI); Props.SizeEstimation = std::min(Metrics.NumInsts, Metrics.NumBlocks * 5); Props.CanBeUnswitchedCount = MaxSize / (Props.SizeEstimation); @@ -334,6 +336,7 @@ void LUAnalysisCache::cloneData(const Loop* NewLoop, const Loop* OldLoop, char LoopUnswitch::ID = 0; INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops", false, false) +INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(LCSSA) @@ -424,7 +427,7 @@ bool LoopUnswitch::processCurrentLoop() { // Probably we reach the quota of branches for this loop. If so // stop unswitching. - if (!BranchesInfo.countLoop(currentLoop)) + if (!BranchesInfo.countLoop(currentLoop, getAnalysis<TargetTransformInfo>())) return false; // Loop over all of the basic blocks in the loop. If we find an interior diff --git a/lib/Transforms/Scalar/ObjCARC.cpp b/lib/Transforms/Scalar/ObjCARC.cpp deleted file mode 100644 index e6ec841..0000000 --- a/lib/Transforms/Scalar/ObjCARC.cpp +++ /dev/null @@ -1,4354 +0,0 @@ -//===- ObjCARC.cpp - ObjC ARC Optimization --------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines ObjC ARC optimizations. ARC stands for -// Automatic Reference Counting and is a system for managing reference counts -// for objects in Objective C. -// -// The optimizations performed include elimination of redundant, partially -// redundant, and inconsequential reference count operations, elimination of -// redundant weak pointer operations, pattern-matching and replacement of -// low-level operations into higher-level operations, and numerous minor -// simplifications. -// -// This file also defines a simple ARC-aware AliasAnalysis. -// -// WARNING: This file knows about certain library functions. It recognizes them -// by name, and hardwires knowledge of their semantics. -// -// WARNING: This file knows about how certain Objective-C library functions are -// used. Naive LLVM IR transformations which would otherwise be -// behavior-preserving may break these assumptions. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "objc-arc" -#include "llvm/ADT/DenseMap.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -// A handy option to enable/disable all optimizations in this file. -static cl::opt<bool> EnableARCOpts("enable-objc-arc-opts", cl::init(true)); - -//===----------------------------------------------------------------------===// -// Misc. Utilities -//===----------------------------------------------------------------------===// - -namespace { - /// MapVector - An associative container with fast insertion-order - /// (deterministic) iteration over its elements. 
Plus the special - /// blot operation. - template<class KeyT, class ValueT> - class MapVector { - /// Map - Map keys to indices in Vector. - typedef DenseMap<KeyT, size_t> MapTy; - MapTy Map; - - /// Vector - Keys and values. - typedef std::vector<std::pair<KeyT, ValueT> > VectorTy; - VectorTy Vector; - - public: - typedef typename VectorTy::iterator iterator; - typedef typename VectorTy::const_iterator const_iterator; - iterator begin() { return Vector.begin(); } - iterator end() { return Vector.end(); } - const_iterator begin() const { return Vector.begin(); } - const_iterator end() const { return Vector.end(); } - -#ifdef XDEBUG - ~MapVector() { - assert(Vector.size() >= Map.size()); // May differ due to blotting. - for (typename MapTy::const_iterator I = Map.begin(), E = Map.end(); - I != E; ++I) { - assert(I->second < Vector.size()); - assert(Vector[I->second].first == I->first); - } - for (typename VectorTy::const_iterator I = Vector.begin(), - E = Vector.end(); I != E; ++I) - assert(!I->first || - (Map.count(I->first) && - Map[I->first] == size_t(I - Vector.begin()))); - } -#endif - - ValueT &operator[](const KeyT &Arg) { - std::pair<typename MapTy::iterator, bool> Pair = - Map.insert(std::make_pair(Arg, size_t(0))); - if (Pair.second) { - size_t Num = Vector.size(); - Pair.first->second = Num; - Vector.push_back(std::make_pair(Arg, ValueT())); - return Vector[Num].second; - } - return Vector[Pair.first->second].second; - } - - std::pair<iterator, bool> - insert(const std::pair<KeyT, ValueT> &InsertPair) { - std::pair<typename MapTy::iterator, bool> Pair = - Map.insert(std::make_pair(InsertPair.first, size_t(0))); - if (Pair.second) { - size_t Num = Vector.size(); - Pair.first->second = Num; - Vector.push_back(InsertPair); - return std::make_pair(Vector.begin() + Num, true); - } - return std::make_pair(Vector.begin() + Pair.first->second, false); - } - - const_iterator find(const KeyT &Key) const { - typename MapTy::const_iterator It = Map.find(Key); - if (It == Map.end()) return Vector.end(); - return Vector.begin() + It->second; - } - - /// blot - This is similar to erase, but instead of removing the element - /// from the vector, it just zeros out the key in the vector. This leaves - /// iterators intact, but clients must be prepared for zeroed-out keys when - /// iterating. - void blot(const KeyT &Key) { - typename MapTy::iterator It = Map.find(Key); - if (It == Map.end()) return; - Vector[It->second].first = KeyT(); - Map.erase(It); - } - - void clear() { - Map.clear(); - Vector.clear(); - } - }; -} - -//===----------------------------------------------------------------------===// -// ARC Utilities. -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/StringSwitch.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/CallSite.h" -#include "llvm/Transforms/Utils/Local.h" - -namespace { - /// InstructionClass - A simple classification for instructions. - enum InstructionClass { - IC_Retain, ///< objc_retain - IC_RetainRV, ///< objc_retainAutoreleasedReturnValue - IC_RetainBlock, ///< objc_retainBlock - IC_Release, ///< objc_release - IC_Autorelease, ///< objc_autorelease - IC_AutoreleaseRV, ///< objc_autoreleaseReturnValue - IC_AutoreleasepoolPush, ///< objc_autoreleasePoolPush - IC_AutoreleasepoolPop, ///< objc_autoreleasePoolPop - IC_NoopCast, ///< objc_retainedObject, etc. 
- IC_FusedRetainAutorelease, ///< objc_retainAutorelease
- IC_FusedRetainAutoreleaseRV, ///< objc_retainAutoreleaseReturnValue
- IC_LoadWeakRetained, ///< objc_loadWeakRetained (primitive)
- IC_StoreWeak, ///< objc_storeWeak (primitive)
- IC_InitWeak, ///< objc_initWeak (derived)
- IC_LoadWeak, ///< objc_loadWeak (derived)
- IC_MoveWeak, ///< objc_moveWeak (derived)
- IC_CopyWeak, ///< objc_copyWeak (derived)
- IC_DestroyWeak, ///< objc_destroyWeak (derived)
- IC_StoreStrong, ///< objc_storeStrong (derived)
- IC_CallOrUser, ///< could call objc_release and/or "use" pointers
- IC_Call, ///< could call objc_release
- IC_User, ///< could "use" a pointer
- IC_None ///< anything else
- };
-}
-
-/// IsPotentialUse - Test whether the given value is possibly a
-/// reference-counted pointer.
-static bool IsPotentialUse(const Value *Op) {
- // Pointers to static or stack storage are not reference-counted pointers.
- if (isa<Constant>(Op) || isa<AllocaInst>(Op))
- return false;
- // Special arguments are not reference-counted.
- if (const Argument *Arg = dyn_cast<Argument>(Op))
- if (Arg->hasByValAttr() ||
- Arg->hasNestAttr() ||
- Arg->hasStructRetAttr())
- return false;
- // Only consider values with pointer types.
- // It seems intuitive to exclude function pointer types as well, since
- // functions are never reference-counted; however, clang occasionally
- // bitcasts reference-counted pointers to function-pointer type
- // temporarily.
- PointerType *Ty = dyn_cast<PointerType>(Op->getType());
- if (!Ty)
- return false;
- // Conservatively assume anything else is a potential use.
- return true;
-}
-
-/// GetCallSiteClass - Helper for GetInstructionClass. Determines what kind
-/// of construct CS is.
-static InstructionClass GetCallSiteClass(ImmutableCallSite CS) {
- for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
- I != E; ++I)
- if (IsPotentialUse(*I))
- return CS.onlyReadsMemory() ? IC_User : IC_CallOrUser;
-
- return CS.onlyReadsMemory() ? IC_None : IC_Call;
-}
-
-/// GetFunctionClass - Determine if F is one of the special known Functions.
-/// If it isn't, return IC_CallOrUser.
-static InstructionClass GetFunctionClass(const Function *F) {
- Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
-
- // No arguments.
- if (AI == AE)
- return StringSwitch<InstructionClass>(F->getName())
- .Case("objc_autoreleasePoolPush", IC_AutoreleasepoolPush)
- .Default(IC_CallOrUser);
-
- // One argument.
- const Argument *A0 = AI++;
- if (AI == AE)
- // Argument is a pointer.
- if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) {
- Type *ETy = PTy->getElementType();
- // Argument is i8*.
- if (ETy->isIntegerTy(8)) - return StringSwitch<InstructionClass>(F->getName()) - .Case("objc_retain", IC_Retain) - .Case("objc_retainAutoreleasedReturnValue", IC_RetainRV) - .Case("objc_retainBlock", IC_RetainBlock) - .Case("objc_release", IC_Release) - .Case("objc_autorelease", IC_Autorelease) - .Case("objc_autoreleaseReturnValue", IC_AutoreleaseRV) - .Case("objc_autoreleasePoolPop", IC_AutoreleasepoolPop) - .Case("objc_retainedObject", IC_NoopCast) - .Case("objc_unretainedObject", IC_NoopCast) - .Case("objc_unretainedPointer", IC_NoopCast) - .Case("objc_retain_autorelease", IC_FusedRetainAutorelease) - .Case("objc_retainAutorelease", IC_FusedRetainAutorelease) - .Case("objc_retainAutoreleaseReturnValue",IC_FusedRetainAutoreleaseRV) - .Default(IC_CallOrUser); - - // Argument is i8** - if (PointerType *Pte = dyn_cast<PointerType>(ETy)) - if (Pte->getElementType()->isIntegerTy(8)) - return StringSwitch<InstructionClass>(F->getName()) - .Case("objc_loadWeakRetained", IC_LoadWeakRetained) - .Case("objc_loadWeak", IC_LoadWeak) - .Case("objc_destroyWeak", IC_DestroyWeak) - .Default(IC_CallOrUser); - } - - // Two arguments, first is i8**. - const Argument *A1 = AI++; - if (AI == AE) - if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) - if (PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType())) - if (Pte->getElementType()->isIntegerTy(8)) - if (PointerType *PTy1 = dyn_cast<PointerType>(A1->getType())) { - Type *ETy1 = PTy1->getElementType(); - // Second argument is i8* - if (ETy1->isIntegerTy(8)) - return StringSwitch<InstructionClass>(F->getName()) - .Case("objc_storeWeak", IC_StoreWeak) - .Case("objc_initWeak", IC_InitWeak) - .Case("objc_storeStrong", IC_StoreStrong) - .Default(IC_CallOrUser); - // Second argument is i8**. - if (PointerType *Pte1 = dyn_cast<PointerType>(ETy1)) - if (Pte1->getElementType()->isIntegerTy(8)) - return StringSwitch<InstructionClass>(F->getName()) - .Case("objc_moveWeak", IC_MoveWeak) - .Case("objc_copyWeak", IC_CopyWeak) - .Default(IC_CallOrUser); - } - - // Anything else. - return IC_CallOrUser; -} - -/// GetInstructionClass - Determine what kind of construct V is. -static InstructionClass GetInstructionClass(const Value *V) { - if (const Instruction *I = dyn_cast<Instruction>(V)) { - // Any instruction other than bitcast and gep with a pointer operand have a - // use of an objc pointer. Bitcasts, GEPs, Selects, PHIs transfer a pointer - // to a subsequent use, rather than using it themselves, in this sense. - // As a short cut, several other opcodes are known to have no pointer - // operands of interest. And ret is never followed by a release, so it's - // not interesting to examine. - switch (I->getOpcode()) { - case Instruction::Call: { - const CallInst *CI = cast<CallInst>(I); - // Check for calls to special functions. - if (const Function *F = CI->getCalledFunction()) { - InstructionClass Class = GetFunctionClass(F); - if (Class != IC_CallOrUser) - return Class; - - // None of the intrinsic functions do objc_release. For intrinsics, the - // only question is whether or not they may be users. 
- switch (F->getIntrinsicID()) { - case Intrinsic::returnaddress: case Intrinsic::frameaddress: - case Intrinsic::stacksave: case Intrinsic::stackrestore: - case Intrinsic::vastart: case Intrinsic::vacopy: case Intrinsic::vaend: - case Intrinsic::objectsize: case Intrinsic::prefetch: - case Intrinsic::stackprotector: - case Intrinsic::eh_return_i32: case Intrinsic::eh_return_i64: - case Intrinsic::eh_typeid_for: case Intrinsic::eh_dwarf_cfa: - case Intrinsic::eh_sjlj_lsda: case Intrinsic::eh_sjlj_functioncontext: - case Intrinsic::init_trampoline: case Intrinsic::adjust_trampoline: - case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: - case Intrinsic::invariant_start: case Intrinsic::invariant_end: - // Don't let dbg info affect our results. - case Intrinsic::dbg_declare: case Intrinsic::dbg_value: - // Short cut: Some intrinsics obviously don't use ObjC pointers. - return IC_None; - default: - break; - } - } - return GetCallSiteClass(CI); - } - case Instruction::Invoke: - return GetCallSiteClass(cast<InvokeInst>(I)); - case Instruction::BitCast: - case Instruction::GetElementPtr: - case Instruction::Select: case Instruction::PHI: - case Instruction::Ret: case Instruction::Br: - case Instruction::Switch: case Instruction::IndirectBr: - case Instruction::Alloca: case Instruction::VAArg: - case Instruction::Add: case Instruction::FAdd: - case Instruction::Sub: case Instruction::FSub: - case Instruction::Mul: case Instruction::FMul: - case Instruction::SDiv: case Instruction::UDiv: case Instruction::FDiv: - case Instruction::SRem: case Instruction::URem: case Instruction::FRem: - case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: - case Instruction::And: case Instruction::Or: case Instruction::Xor: - case Instruction::SExt: case Instruction::ZExt: case Instruction::Trunc: - case Instruction::IntToPtr: case Instruction::FCmp: - case Instruction::FPTrunc: case Instruction::FPExt: - case Instruction::FPToUI: case Instruction::FPToSI: - case Instruction::UIToFP: case Instruction::SIToFP: - case Instruction::InsertElement: case Instruction::ExtractElement: - case Instruction::ShuffleVector: - case Instruction::ExtractValue: - break; - case Instruction::ICmp: - // Comparing a pointer with null, or any other constant, isn't an - // interesting use, because we don't care what the pointer points to, or - // about the values of any other dynamic reference-counted pointers. - if (IsPotentialUse(I->getOperand(1))) - return IC_User; - break; - default: - // For anything else, check all the operands. - // Note that this includes both operands of a Store: while the first - // operand isn't actually being dereferenced, it is being stored to - // memory where we can no longer track who might read it and dereference - // it, so we have to consider it potentially used. - for (User::const_op_iterator OI = I->op_begin(), OE = I->op_end(); - OI != OE; ++OI) - if (IsPotentialUse(*OI)) - return IC_User; - } - } - - // Otherwise, it's totally inert for ARC purposes. - return IC_None; -} - -/// GetBasicInstructionClass - Determine what kind of construct V is. This is -/// similar to GetInstructionClass except that it only detects objc runtine -/// calls. This allows it to be faster. -static InstructionClass GetBasicInstructionClass(const Value *V) { - if (const CallInst *CI = dyn_cast<CallInst>(V)) { - if (const Function *F = CI->getCalledFunction()) - return GetFunctionClass(F); - // Otherwise, be conservative. - return IC_CallOrUser; - } - - // Otherwise, be conservative. 
- return isa<InvokeInst>(V) ? IC_CallOrUser : IC_User; -} - -/// IsRetain - Test if the given class is objc_retain or -/// equivalent. -static bool IsRetain(InstructionClass Class) { - return Class == IC_Retain || - Class == IC_RetainRV; -} - -/// IsAutorelease - Test if the given class is objc_autorelease or -/// equivalent. -static bool IsAutorelease(InstructionClass Class) { - return Class == IC_Autorelease || - Class == IC_AutoreleaseRV; -} - -/// IsForwarding - Test if the given class represents instructions which return -/// their argument verbatim. -static bool IsForwarding(InstructionClass Class) { - // objc_retainBlock technically doesn't always return its argument - // verbatim, but it doesn't matter for our purposes here. - return Class == IC_Retain || - Class == IC_RetainRV || - Class == IC_Autorelease || - Class == IC_AutoreleaseRV || - Class == IC_RetainBlock || - Class == IC_NoopCast; -} - -/// IsNoopOnNull - Test if the given class represents instructions which do -/// nothing if passed a null pointer. -static bool IsNoopOnNull(InstructionClass Class) { - return Class == IC_Retain || - Class == IC_RetainRV || - Class == IC_Release || - Class == IC_Autorelease || - Class == IC_AutoreleaseRV || - Class == IC_RetainBlock; -} - -/// IsAlwaysTail - Test if the given class represents instructions which are -/// always safe to mark with the "tail" keyword. -static bool IsAlwaysTail(InstructionClass Class) { - // IC_RetainBlock may be given a stack argument. - return Class == IC_Retain || - Class == IC_RetainRV || - Class == IC_Autorelease || - Class == IC_AutoreleaseRV; -} - -/// IsNoThrow - Test if the given class represents instructions which are always -/// safe to mark with the nounwind attribute.. -static bool IsNoThrow(InstructionClass Class) { - // objc_retainBlock is not nounwind because it calls user copy constructors - // which could theoretically throw. - return Class == IC_Retain || - Class == IC_RetainRV || - Class == IC_Release || - Class == IC_Autorelease || - Class == IC_AutoreleaseRV || - Class == IC_AutoreleasepoolPush || - Class == IC_AutoreleasepoolPop; -} - -/// EraseInstruction - Erase the given instruction. Many ObjC calls return their -/// argument verbatim, so if it's such a call and the return value has users, -/// replace them with the argument value. -static void EraseInstruction(Instruction *CI) { - Value *OldArg = cast<CallInst>(CI)->getArgOperand(0); - - bool Unused = CI->use_empty(); - - if (!Unused) { - // Replace the return value with the argument. - assert(IsForwarding(GetBasicInstructionClass(CI)) && - "Can't delete non-forwarding instruction with users!"); - CI->replaceAllUsesWith(OldArg); - } - - CI->eraseFromParent(); - - if (Unused) - RecursivelyDeleteTriviallyDeadInstructions(OldArg); -} - -/// GetUnderlyingObjCPtr - This is a wrapper around getUnderlyingObject which -/// also knows how to look through objc_retain and objc_autorelease calls, which -/// we know to return their argument verbatim. -static const Value *GetUnderlyingObjCPtr(const Value *V) { - for (;;) { - V = GetUnderlyingObject(V); - if (!IsForwarding(GetBasicInstructionClass(V))) - break; - V = cast<CallInst>(V)->getArgOperand(0); - } - - return V; -} - -/// StripPointerCastsAndObjCCalls - This is a wrapper around -/// Value::stripPointerCasts which also knows how to look through objc_retain -/// and objc_autorelease calls, which we know to return their argument verbatim. 
-static const Value *StripPointerCastsAndObjCCalls(const Value *V) { - for (;;) { - V = V->stripPointerCasts(); - if (!IsForwarding(GetBasicInstructionClass(V))) - break; - V = cast<CallInst>(V)->getArgOperand(0); - } - return V; -} - -/// StripPointerCastsAndObjCCalls - This is a wrapper around -/// Value::stripPointerCasts which also knows how to look through objc_retain -/// and objc_autorelease calls, which we know to return their argument verbatim. -static Value *StripPointerCastsAndObjCCalls(Value *V) { - for (;;) { - V = V->stripPointerCasts(); - if (!IsForwarding(GetBasicInstructionClass(V))) - break; - V = cast<CallInst>(V)->getArgOperand(0); - } - return V; -} - -/// GetObjCArg - Assuming the given instruction is one of the special calls such -/// as objc_retain or objc_release, return the argument value, stripped of no-op -/// casts and forwarding calls. -static Value *GetObjCArg(Value *Inst) { - return StripPointerCastsAndObjCCalls(cast<CallInst>(Inst)->getArgOperand(0)); -} - -/// IsObjCIdentifiedObject - This is similar to AliasAnalysis' -/// isObjCIdentifiedObject, except that it uses special knowledge of -/// ObjC conventions... -static bool IsObjCIdentifiedObject(const Value *V) { - // Assume that call results and arguments have their own "provenance". - // Constants (including GlobalVariables) and Allocas are never - // reference-counted. - if (isa<CallInst>(V) || isa<InvokeInst>(V) || - isa<Argument>(V) || isa<Constant>(V) || - isa<AllocaInst>(V)) - return true; - - if (const LoadInst *LI = dyn_cast<LoadInst>(V)) { - const Value *Pointer = - StripPointerCastsAndObjCCalls(LI->getPointerOperand()); - if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Pointer)) { - // A constant pointer can't be pointing to an object on the heap. It may - // be reference-counted, but it won't be deleted. - if (GV->isConstant()) - return true; - StringRef Name = GV->getName(); - // These special variables are known to hold values which are not - // reference-counted pointers. - if (Name.startswith("\01L_OBJC_SELECTOR_REFERENCES_") || - Name.startswith("\01L_OBJC_CLASSLIST_REFERENCES_") || - Name.startswith("\01L_OBJC_CLASSLIST_SUP_REFS_$_") || - Name.startswith("\01L_OBJC_METH_VAR_NAME_") || - Name.startswith("\01l_objc_msgSend_fixup_")) - return true; - } - } - - return false; -} - -/// FindSingleUseIdentifiedObject - This is similar to -/// StripPointerCastsAndObjCCalls but it stops as soon as it finds a value -/// with multiple uses. -static const Value *FindSingleUseIdentifiedObject(const Value *Arg) { - if (Arg->hasOneUse()) { - if (const BitCastInst *BC = dyn_cast<BitCastInst>(Arg)) - return FindSingleUseIdentifiedObject(BC->getOperand(0)); - if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Arg)) - if (GEP->hasAllZeroIndices()) - return FindSingleUseIdentifiedObject(GEP->getPointerOperand()); - if (IsForwarding(GetBasicInstructionClass(Arg))) - return FindSingleUseIdentifiedObject( - cast<CallInst>(Arg)->getArgOperand(0)); - if (!IsObjCIdentifiedObject(Arg)) - return 0; - return Arg; - } - - // If we found an identifiable object but it has multiple uses, but they are - // trivial uses, we can still consider this to be a single-use value. 
- if (IsObjCIdentifiedObject(Arg)) { - for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end(); - UI != UE; ++UI) { - const User *U = *UI; - if (!U->use_empty() || StripPointerCastsAndObjCCalls(U) != Arg) - return 0; - } - - return Arg; - } - - return 0; -} - -/// ModuleHasARC - Test if the given module looks interesting to run ARC -/// optimization on. -static bool ModuleHasARC(const Module &M) { - return - M.getNamedValue("objc_retain") || - M.getNamedValue("objc_release") || - M.getNamedValue("objc_autorelease") || - M.getNamedValue("objc_retainAutoreleasedReturnValue") || - M.getNamedValue("objc_retainBlock") || - M.getNamedValue("objc_autoreleaseReturnValue") || - M.getNamedValue("objc_autoreleasePoolPush") || - M.getNamedValue("objc_loadWeakRetained") || - M.getNamedValue("objc_loadWeak") || - M.getNamedValue("objc_destroyWeak") || - M.getNamedValue("objc_storeWeak") || - M.getNamedValue("objc_initWeak") || - M.getNamedValue("objc_moveWeak") || - M.getNamedValue("objc_copyWeak") || - M.getNamedValue("objc_retainedObject") || - M.getNamedValue("objc_unretainedObject") || - M.getNamedValue("objc_unretainedPointer"); -} - -/// DoesObjCBlockEscape - Test whether the given pointer, which is an -/// Objective C block pointer, does not "escape". This differs from regular -/// escape analysis in that a use as an argument to a call is not considered -/// an escape. -static bool DoesObjCBlockEscape(const Value *BlockPtr) { - // Walk the def-use chains. - SmallVector<const Value *, 4> Worklist; - Worklist.push_back(BlockPtr); - do { - const Value *V = Worklist.pop_back_val(); - for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end(); - UI != UE; ++UI) { - const User *UUser = *UI; - // Special - Use by a call (callee or argument) is not considered - // to be an escape. - switch (GetBasicInstructionClass(UUser)) { - case IC_StoreWeak: - case IC_InitWeak: - case IC_StoreStrong: - case IC_Autorelease: - case IC_AutoreleaseRV: - // These special functions make copies of their pointer arguments. - return true; - case IC_User: - case IC_None: - // Use by an instruction which copies the value is an escape if the - // result is an escape. - if (isa<BitCastInst>(UUser) || isa<GetElementPtrInst>(UUser) || - isa<PHINode>(UUser) || isa<SelectInst>(UUser)) { - Worklist.push_back(UUser); - continue; - } - // Use by a load is not an escape. - if (isa<LoadInst>(UUser)) - continue; - // Use by a store is not an escape if the use is the address. - if (const StoreInst *SI = dyn_cast<StoreInst>(UUser)) - if (V != SI->getValueOperand()) - continue; - break; - default: - // Regular calls and other stuff are not considered escapes. - continue; - } - // Otherwise, conservatively assume an escape. - return true; - } - } while (!Worklist.empty()); - - // No escapes found. - return false; -} - -//===----------------------------------------------------------------------===// -// ARC AliasAnalysis. -//===----------------------------------------------------------------------===// - -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/Passes.h" -#include "llvm/Pass.h" - -namespace { - /// ObjCARCAliasAnalysis - This is a simple alias analysis - /// implementation that uses knowledge of ARC constructs to answer queries. - /// - /// TODO: This class could be generalized to know about other ObjC-specific - /// tricks. Such as knowing that ivars in the non-fragile ABI are non-aliasing - /// even though their offsets are dynamic. 
- class ObjCARCAliasAnalysis : public ImmutablePass, - public AliasAnalysis { - public: - static char ID; // Class identification, replacement for typeinfo - ObjCARCAliasAnalysis() : ImmutablePass(ID) { - initializeObjCARCAliasAnalysisPass(*PassRegistry::getPassRegistry()); - } - - private: - virtual void initializePass() { - InitializeAliasAnalysis(this); - } - - /// getAdjustedAnalysisPointer - This method is used when a pass implements - /// an analysis interface through multiple inheritance. If needed, it - /// should override this to adjust the this pointer as needed for the - /// specified pass info. - virtual void *getAdjustedAnalysisPointer(const void *PI) { - if (PI == &AliasAnalysis::ID) - return static_cast<AliasAnalysis *>(this); - return this; - } - - virtual void getAnalysisUsage(AnalysisUsage &AU) const; - virtual AliasResult alias(const Location &LocA, const Location &LocB); - virtual bool pointsToConstantMemory(const Location &Loc, bool OrLocal); - virtual ModRefBehavior getModRefBehavior(ImmutableCallSite CS); - virtual ModRefBehavior getModRefBehavior(const Function *F); - virtual ModRefResult getModRefInfo(ImmutableCallSite CS, - const Location &Loc); - virtual ModRefResult getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2); - }; -} // End of anonymous namespace - -// Register this pass... -char ObjCARCAliasAnalysis::ID = 0; -INITIALIZE_AG_PASS(ObjCARCAliasAnalysis, AliasAnalysis, "objc-arc-aa", - "ObjC-ARC-Based Alias Analysis", false, true, false) - -ImmutablePass *llvm::createObjCARCAliasAnalysisPass() { - return new ObjCARCAliasAnalysis(); -} - -void -ObjCARCAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesAll(); - AliasAnalysis::getAnalysisUsage(AU); -} - -AliasAnalysis::AliasResult -ObjCARCAliasAnalysis::alias(const Location &LocA, const Location &LocB) { - if (!EnableARCOpts) - return AliasAnalysis::alias(LocA, LocB); - - // First, strip off no-ops, including ObjC-specific no-ops, and try making a - // precise alias query. - const Value *SA = StripPointerCastsAndObjCCalls(LocA.Ptr); - const Value *SB = StripPointerCastsAndObjCCalls(LocB.Ptr); - AliasResult Result = - AliasAnalysis::alias(Location(SA, LocA.Size, LocA.TBAATag), - Location(SB, LocB.Size, LocB.TBAATag)); - if (Result != MayAlias) - return Result; - - // If that failed, climb to the underlying object, including climbing through - // ObjC-specific no-ops, and try making an imprecise alias query. - const Value *UA = GetUnderlyingObjCPtr(SA); - const Value *UB = GetUnderlyingObjCPtr(SB); - if (UA != SA || UB != SB) { - Result = AliasAnalysis::alias(Location(UA), Location(UB)); - // We can't use MustAlias or PartialAlias results here because - // GetUnderlyingObjCPtr may return an offsetted pointer value. - if (Result == NoAlias) - return NoAlias; - } - - // If that failed, fail. We don't need to chain here, since that's covered - // by the earlier precise query. - return MayAlias; -} - -bool -ObjCARCAliasAnalysis::pointsToConstantMemory(const Location &Loc, - bool OrLocal) { - if (!EnableARCOpts) - return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); - - // First, strip off no-ops, including ObjC-specific no-ops, and try making - // a precise alias query. 
- const Value *S = StripPointerCastsAndObjCCalls(Loc.Ptr); - if (AliasAnalysis::pointsToConstantMemory(Location(S, Loc.Size, Loc.TBAATag), - OrLocal)) - return true; - - // If that failed, climb to the underlying object, including climbing through - // ObjC-specific no-ops, and try making an imprecise alias query. - const Value *U = GetUnderlyingObjCPtr(S); - if (U != S) - return AliasAnalysis::pointsToConstantMemory(Location(U), OrLocal); - - // If that failed, fail. We don't need to chain here, since that's covered - // by the earlier precise query. - return false; -} - -AliasAnalysis::ModRefBehavior -ObjCARCAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) { - // We have nothing to do. Just chain to the next AliasAnalysis. - return AliasAnalysis::getModRefBehavior(CS); -} - -AliasAnalysis::ModRefBehavior -ObjCARCAliasAnalysis::getModRefBehavior(const Function *F) { - if (!EnableARCOpts) - return AliasAnalysis::getModRefBehavior(F); - - switch (GetFunctionClass(F)) { - case IC_NoopCast: - return DoesNotAccessMemory; - default: - break; - } - - return AliasAnalysis::getModRefBehavior(F); -} - -AliasAnalysis::ModRefResult -ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) { - if (!EnableARCOpts) - return AliasAnalysis::getModRefInfo(CS, Loc); - - switch (GetBasicInstructionClass(CS.getInstruction())) { - case IC_Retain: - case IC_RetainRV: - case IC_Autorelease: - case IC_AutoreleaseRV: - case IC_NoopCast: - case IC_AutoreleasepoolPush: - case IC_FusedRetainAutorelease: - case IC_FusedRetainAutoreleaseRV: - // These functions don't access any memory visible to the compiler. - // Note that this doesn't include objc_retainBlock, because it updates - // pointers when it copies block data. - return NoModRef; - default: - break; - } - - return AliasAnalysis::getModRefInfo(CS, Loc); -} - -AliasAnalysis::ModRefResult -ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) { - // TODO: Theoretically we could check for dependencies between objc_* calls - // and OnlyAccessesArgumentPointees calls or other well-behaved calls. - return AliasAnalysis::getModRefInfo(CS1, CS2); -} - -//===----------------------------------------------------------------------===// -// ARC expansion. -//===----------------------------------------------------------------------===// - -#include "llvm/Support/InstIterator.h" -#include "llvm/Transforms/Scalar.h" - -namespace { - /// ObjCARCExpand - Early ARC transformations. - class ObjCARCExpand : public FunctionPass { - virtual void getAnalysisUsage(AnalysisUsage &AU) const; - virtual bool doInitialization(Module &M); - virtual bool runOnFunction(Function &F); - - /// Run - A flag indicating whether this optimization pass should run. - bool Run; - - public: - static char ID; - ObjCARCExpand() : FunctionPass(ID) { - initializeObjCARCExpandPass(*PassRegistry::getPassRegistry()); - } - }; -} - -char ObjCARCExpand::ID = 0; -INITIALIZE_PASS(ObjCARCExpand, - "objc-arc-expand", "ObjC ARC expansion", false, false) - -Pass *llvm::createObjCARCExpandPass() { - return new ObjCARCExpand(); -} - -void ObjCARCExpand::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); -} - -bool ObjCARCExpand::doInitialization(Module &M) { - Run = ModuleHasARC(M); - return false; -} - -bool ObjCARCExpand::runOnFunction(Function &F) { - if (!EnableARCOpts) - return false; - - // If nothing in the Module uses ARC, don't do anything. 
- if (!Run) - return false; - - bool Changed = false; - - for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) { - Instruction *Inst = &*I; - - DEBUG(dbgs() << "ObjCARCExpand: Visiting: " << *Inst << "\n"); - - switch (GetBasicInstructionClass(Inst)) { - case IC_Retain: - case IC_RetainRV: - case IC_Autorelease: - case IC_AutoreleaseRV: - case IC_FusedRetainAutorelease: - case IC_FusedRetainAutoreleaseRV: { - // These calls return their argument verbatim, as a low-level - // optimization. However, this makes high-level optimizations - // harder. Undo any uses of this optimization that the front-end - // emitted here. We'll redo them in the contract pass. - Changed = true; - Value *Value = cast<CallInst>(Inst)->getArgOperand(0); - DEBUG(dbgs() << "ObjCARCExpand: Old = " << *Inst << "\n" - " New = " << *Value << "\n"); - Inst->replaceAllUsesWith(Value); - break; - } - default: - break; - } - } - - DEBUG(dbgs() << "ObjCARCExpand: Finished List.\n\n"); - - return Changed; -} - -//===----------------------------------------------------------------------===// -// ARC autorelease pool elimination. -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/STLExtras.h" -#include "llvm/IR/Constants.h" - -namespace { - /// ObjCARCAPElim - Autorelease pool elimination. - class ObjCARCAPElim : public ModulePass { - virtual void getAnalysisUsage(AnalysisUsage &AU) const; - virtual bool runOnModule(Module &M); - - static bool MayAutorelease(ImmutableCallSite CS, unsigned Depth = 0); - static bool OptimizeBB(BasicBlock *BB); - - public: - static char ID; - ObjCARCAPElim() : ModulePass(ID) { - initializeObjCARCAPElimPass(*PassRegistry::getPassRegistry()); - } - }; -} - -char ObjCARCAPElim::ID = 0; -INITIALIZE_PASS(ObjCARCAPElim, - "objc-arc-apelim", - "ObjC ARC autorelease pool elimination", - false, false) - -Pass *llvm::createObjCARCAPElimPass() { - return new ObjCARCAPElim(); -} - -void ObjCARCAPElim::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); -} - -/// MayAutorelease - Interprocedurally determine if calls made by the -/// given call site can possibly produce autoreleases. -bool ObjCARCAPElim::MayAutorelease(ImmutableCallSite CS, unsigned Depth) { - if (const Function *Callee = CS.getCalledFunction()) { - if (Callee->isDeclaration() || Callee->mayBeOverridden()) - return true; - for (Function::const_iterator I = Callee->begin(), E = Callee->end(); - I != E; ++I) { - const BasicBlock *BB = I; - for (BasicBlock::const_iterator J = BB->begin(), F = BB->end(); - J != F; ++J) - if (ImmutableCallSite JCS = ImmutableCallSite(J)) - // This recursion depth limit is arbitrary. It's just great - // enough to cover known interesting testcases. - if (Depth < 3 && - !JCS.onlyReadsMemory() && - MayAutorelease(JCS, Depth + 1)) - return true; - } - return false; - } - - return true; -} - -bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) { - bool Changed = false; - - Instruction *Push = 0; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { - Instruction *Inst = I++; - switch (GetBasicInstructionClass(Inst)) { - case IC_AutoreleasepoolPush: - Push = Inst; - break; - case IC_AutoreleasepoolPop: - // If this pop matches a push and nothing in between can autorelease, - // zap the pair. 
- if (Push && cast<CallInst>(Inst)->getArgOperand(0) == Push) {
- Changed = true;
- DEBUG(dbgs() << "ObjCARCAPElim::OptimizeBB: Zapping push pop autorelease pair:\n"
- << " Pop: " << *Inst << "\n"
- << " Push: " << *Push << "\n");
- Inst->eraseFromParent();
- Push->eraseFromParent();
- }
- Push = 0;
- break;
- case IC_CallOrUser:
- if (MayAutorelease(ImmutableCallSite(Inst)))
- Push = 0;
- break;
- default:
- break;
- }
- }
-
- return Changed;
-}
-
-bool ObjCARCAPElim::runOnModule(Module &M) {
- if (!EnableARCOpts)
- return false;
-
- // If nothing in the Module uses ARC, don't do anything.
- if (!ModuleHasARC(M))
- return false;
-
- // Find the llvm.global_ctors variable, as the first step in
- // identifying the global constructors. In theory, unnecessary autorelease
- // pools could occur anywhere, but in practice it's pretty rare. Global
- // ctors are a place where autorelease pools get inserted automatically,
- // so it's pretty common for them to be unnecessary, and it's pretty
- // profitable to eliminate them.
- GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
- if (!GV)
- return false;
-
- assert(GV->hasDefinitiveInitializer() &&
- "llvm.global_ctors is uncooperative!");
-
- bool Changed = false;
-
- // Dig the constructor functions out of GV's initializer.
- ConstantArray *Init = cast<ConstantArray>(GV->getInitializer());
- for (User::op_iterator OI = Init->op_begin(), OE = Init->op_end();
- OI != OE; ++OI) {
- Value *Op = *OI;
- // llvm.global_ctors is an array of pairs where the second members
- // are constructor functions.
- Function *F = dyn_cast<Function>(cast<ConstantStruct>(Op)->getOperand(1));
- // If the user used a constructor function with the wrong signature and
- // it got bitcasted or whatever, look the other way.
- if (!F)
- continue;
- // Only look at function definitions.
- if (F->isDeclaration())
- continue;
- // Only look at functions with one basic block.
- if (llvm::next(F->begin()) != F->end())
- continue;
- // Ok, a single-block constructor function definition. Try to optimize it.
- Changed |= OptimizeBB(F->begin());
- }
-
- return Changed;
-}
-
-//===----------------------------------------------------------------------===//
-// ARC optimization.
-//===----------------------------------------------------------------------===//
-
-// TODO: On code like this:
-//
-// objc_retain(%x)
-// stuff_that_cannot_release()
-// objc_autorelease(%x)
-// stuff_that_cannot_release()
-// objc_retain(%x)
-// stuff_that_cannot_release()
-// objc_autorelease(%x)
-//
-// The second retain and autorelease can be deleted.
-
-// TODO: It should be possible to delete
-// objc_autoreleasePoolPush and objc_autoreleasePoolPop
-// pairs if nothing is actually autoreleased between them. Also, autorelease
-// calls followed by objc_autoreleasePoolPop calls (perhaps in ObjC++ code
-// after inlining) can be turned into plain release calls.
-
-// TODO: Critical-edge splitting. If the optimal insertion point is
-// a critical edge, the current algorithm has to fail, because it doesn't
-// know how to split edges. It should be possible to make the optimizer
-// think in terms of edges, rather than blocks, and then split critical
-// edges on demand.
-
-// TODO: OptimizeSequences could be generalized to be interprocedural.
-
-// TODO: Recognize that a bunch of other objc runtime calls have
-// non-escaping arguments and non-releasing arguments, and may be
-// non-autoreleasing.
-
-// TODO: Sink autorelease calls as far as possible.
Unfortunately we -// usually can't sink them past other calls, which would be the main -// case where it would be useful. - -// TODO: The pointer returned from objc_loadWeakRetained is retained. - -// TODO: Delete release+retain pairs (rare). - -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/Support/CFG.h" - -STATISTIC(NumNoops, "Number of no-op objc calls eliminated"); -STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated"); -STATISTIC(NumAutoreleases,"Number of autoreleases converted to releases"); -STATISTIC(NumRets, "Number of return value forwarding " - "retain+autoreleaes eliminated"); -STATISTIC(NumRRs, "Number of retain+release paths eliminated"); -STATISTIC(NumPeeps, "Number of calls peephole-optimized"); - -namespace { - /// ProvenanceAnalysis - This is similar to BasicAliasAnalysis, and it - /// uses many of the same techniques, except it uses special ObjC-specific - /// reasoning about pointer relationships. - class ProvenanceAnalysis { - AliasAnalysis *AA; - - typedef std::pair<const Value *, const Value *> ValuePairTy; - typedef DenseMap<ValuePairTy, bool> CachedResultsTy; - CachedResultsTy CachedResults; - - bool relatedCheck(const Value *A, const Value *B); - bool relatedSelect(const SelectInst *A, const Value *B); - bool relatedPHI(const PHINode *A, const Value *B); - - void operator=(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION; - ProvenanceAnalysis(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION; - - public: - ProvenanceAnalysis() {} - - void setAA(AliasAnalysis *aa) { AA = aa; } - - AliasAnalysis *getAA() const { return AA; } - - bool related(const Value *A, const Value *B); - - void clear() { - CachedResults.clear(); - } - }; -} - -bool ProvenanceAnalysis::relatedSelect(const SelectInst *A, const Value *B) { - // If the values are Selects with the same condition, we can do a more precise - // check: just check for relations between the values on corresponding arms. - if (const SelectInst *SB = dyn_cast<SelectInst>(B)) - if (A->getCondition() == SB->getCondition()) - return related(A->getTrueValue(), SB->getTrueValue()) || - related(A->getFalseValue(), SB->getFalseValue()); - - // Check both arms of the Select node individually. - return related(A->getTrueValue(), B) || - related(A->getFalseValue(), B); -} - -bool ProvenanceAnalysis::relatedPHI(const PHINode *A, const Value *B) { - // If the values are PHIs in the same block, we can do a more precise as well - // as efficient check: just check for relations between the values on - // corresponding edges. - if (const PHINode *PNB = dyn_cast<PHINode>(B)) - if (PNB->getParent() == A->getParent()) { - for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i) - if (related(A->getIncomingValue(i), - PNB->getIncomingValueForBlock(A->getIncomingBlock(i)))) - return true; - return false; - } - - // Check each unique source of the PHI node against B. - SmallPtrSet<const Value *, 4> UniqueSrc; - for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i) { - const Value *PV1 = A->getIncomingValue(i); - if (UniqueSrc.insert(PV1) && related(PV1, B)) - return true; - } - - // All of the arms checked out. - return false; -} - -/// isStoredObjCPointer - Test if the value of P, or any value covered by its -/// provenance, is ever stored within the function (not counting callees). 
-static bool isStoredObjCPointer(const Value *P) { - SmallPtrSet<const Value *, 8> Visited; - SmallVector<const Value *, 8> Worklist; - Worklist.push_back(P); - Visited.insert(P); - do { - P = Worklist.pop_back_val(); - for (Value::const_use_iterator UI = P->use_begin(), UE = P->use_end(); - UI != UE; ++UI) { - const User *Ur = *UI; - if (isa<StoreInst>(Ur)) { - if (UI.getOperandNo() == 0) - // The pointer is stored. - return true; - // The pointed is stored through. - continue; - } - if (isa<CallInst>(Ur)) - // The pointer is passed as an argument, ignore this. - continue; - if (isa<PtrToIntInst>(P)) - // Assume the worst. - return true; - if (Visited.insert(Ur)) - Worklist.push_back(Ur); - } - } while (!Worklist.empty()); - - // Everything checked out. - return false; -} - -bool ProvenanceAnalysis::relatedCheck(const Value *A, const Value *B) { - // Skip past provenance pass-throughs. - A = GetUnderlyingObjCPtr(A); - B = GetUnderlyingObjCPtr(B); - - // Quick check. - if (A == B) - return true; - - // Ask regular AliasAnalysis, for a first approximation. - switch (AA->alias(A, B)) { - case AliasAnalysis::NoAlias: - return false; - case AliasAnalysis::MustAlias: - case AliasAnalysis::PartialAlias: - return true; - case AliasAnalysis::MayAlias: - break; - } - - bool AIsIdentified = IsObjCIdentifiedObject(A); - bool BIsIdentified = IsObjCIdentifiedObject(B); - - // An ObjC-Identified object can't alias a load if it is never locally stored. - if (AIsIdentified) { - // Check for an obvious escape. - if (isa<LoadInst>(B)) - return isStoredObjCPointer(A); - if (BIsIdentified) { - // Check for an obvious escape. - if (isa<LoadInst>(A)) - return isStoredObjCPointer(B); - // Both pointers are identified and escapes aren't an evident problem. - return false; - } - } else if (BIsIdentified) { - // Check for an obvious escape. - if (isa<LoadInst>(A)) - return isStoredObjCPointer(B); - } - - // Special handling for PHI and Select. - if (const PHINode *PN = dyn_cast<PHINode>(A)) - return relatedPHI(PN, B); - if (const PHINode *PN = dyn_cast<PHINode>(B)) - return relatedPHI(PN, A); - if (const SelectInst *S = dyn_cast<SelectInst>(A)) - return relatedSelect(S, B); - if (const SelectInst *S = dyn_cast<SelectInst>(B)) - return relatedSelect(S, A); - - // Conservative. - return true; -} - -bool ProvenanceAnalysis::related(const Value *A, const Value *B) { - // Begin by inserting a conservative value into the map. If the insertion - // fails, we have the answer already. If it succeeds, leave it there until we - // compute the real answer to guard against recursive queries. - if (A > B) std::swap(A, B); - std::pair<CachedResultsTy::iterator, bool> Pair = - CachedResults.insert(std::make_pair(ValuePairTy(A, B), true)); - if (!Pair.second) - return Pair.first->second; - - bool Result = relatedCheck(A, B); - CachedResults[ValuePairTy(A, B)] = Result; - return Result; -} - -namespace { - // Sequence - A sequence of states that a pointer may go through in which an - // objc_retain and objc_release are actually needed. - enum Sequence { - S_None, - S_Retain, ///< objc_retain(x) - S_CanRelease, ///< foo(x) -- x could possibly see a ref count decrement - S_Use, ///< any use of x - S_Stop, ///< like S_Release, but code motion is stopped - S_Release, ///< objc_release(x) - S_MovableRelease ///< objc_release(x), !clang.imprecise_release - }; -} - -static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) { - // The easy cases. 
- if (A == B)
- return A;
- if (A == S_None || B == S_None)
- return S_None;
-
- if (A > B) std::swap(A, B);
- if (TopDown) {
- // Choose the side which is further along in the sequence.
- if ((A == S_Retain || A == S_CanRelease) &&
- (B == S_CanRelease || B == S_Use))
- return B;
- } else {
- // Choose the side which is further along in the sequence.
- if ((A == S_Use || A == S_CanRelease) &&
- (B == S_Use || B == S_Release || B == S_Stop || B == S_MovableRelease))
- return A;
- // If both sides are releases, choose the more conservative one.
- if (A == S_Stop && (B == S_Release || B == S_MovableRelease))
- return A;
- if (A == S_Release && B == S_MovableRelease)
- return A;
- }
-
- return S_None;
-}
-
-namespace {
- /// RRInfo - Unidirectional information about either a
- /// retain-decrement-use-release sequence or release-use-decrement-retain
- /// reverse sequence.
- struct RRInfo {
- /// KnownSafe - After an objc_retain, the reference count of the referenced
- /// object is known to be positive. Similarly, before an objc_release, the
- /// reference count of the referenced object is known to be positive. If
- /// there are retain-release pairs in code regions where the retain count
- /// is known to be positive, they can be eliminated, regardless of any side
- /// effects between them.
- ///
- /// Also, a retain+release pair nested within another retain+release
- /// pair all on the same known pointer value can be eliminated, regardless
- /// of any intervening side effects.
- ///
- /// KnownSafe is true when either of these conditions is satisfied.
- bool KnownSafe;
-
- /// IsRetainBlock - True if the Calls are objc_retainBlock calls (as
- /// opposed to objc_retain calls).
- bool IsRetainBlock;
-
- /// IsTailCallRelease - True if the objc_release calls are all marked
- /// with the "tail" keyword.
- bool IsTailCallRelease;
-
- /// ReleaseMetadata - If the Calls are objc_release calls and they all have
- /// a clang.imprecise_release tag, this is the metadata tag.
- MDNode *ReleaseMetadata;
-
- /// Calls - For a top-down sequence, the set of objc_retains or
- /// objc_retainBlocks. For bottom-up, the set of objc_releases.
- SmallPtrSet<Instruction *, 2> Calls;
-
- /// ReverseInsertPts - The set of optimal insert positions for
- /// moving calls in the opposite sequence.
- SmallPtrSet<Instruction *, 2> ReverseInsertPts;
-
- RRInfo() :
- KnownSafe(false), IsRetainBlock(false),
- IsTailCallRelease(false),
- ReleaseMetadata(0) {}
-
- void clear();
- };
-}
-
-void RRInfo::clear() {
- KnownSafe = false;
- IsRetainBlock = false;
- IsTailCallRelease = false;
- ReleaseMetadata = 0;
- Calls.clear();
- ReverseInsertPts.clear();
-}
-
-namespace {
- /// PtrState - This class summarizes several per-pointer runtime properties
- /// which are propagated through the flow graph.
- class PtrState {
- /// KnownPositiveRefCount - True if the reference count is known to
- /// be incremented.
- bool KnownPositiveRefCount;
-
- /// Partial - True if we've seen an opportunity for partial RR elimination,
- /// such as pushing calls into a CFG triangle or into one side of a
- /// CFG diamond.
- bool Partial;
-
- /// Seq - The current position in the sequence.
- Sequence Seq : 8;
-
- public:
- /// RRI - Unidirectional information about the current sequence.
- /// TODO: Encapsulate this better.
- RRInfo RRI; - - PtrState() : KnownPositiveRefCount(false), Partial(false), - Seq(S_None) {} - - void SetKnownPositiveRefCount() { - KnownPositiveRefCount = true; - } - - void ClearRefCount() { - KnownPositiveRefCount = false; - } - - bool IsKnownIncremented() const { - return KnownPositiveRefCount; - } - - void SetSeq(Sequence NewSeq) { - Seq = NewSeq; - } - - Sequence GetSeq() const { - return Seq; - } - - void ClearSequenceProgress() { - ResetSequenceProgress(S_None); - } - - void ResetSequenceProgress(Sequence NewSeq) { - Seq = NewSeq; - Partial = false; - RRI.clear(); - } - - void Merge(const PtrState &Other, bool TopDown); - }; -} - -void -PtrState::Merge(const PtrState &Other, bool TopDown) { - Seq = MergeSeqs(Seq, Other.Seq, TopDown); - KnownPositiveRefCount = KnownPositiveRefCount && Other.KnownPositiveRefCount; - - // We can't merge a plain objc_retain with an objc_retainBlock. - if (RRI.IsRetainBlock != Other.RRI.IsRetainBlock) - Seq = S_None; - - // If we're not in a sequence (anymore), drop all associated state. - if (Seq == S_None) { - Partial = false; - RRI.clear(); - } else if (Partial || Other.Partial) { - // If we're doing a merge on a path that's previously seen a partial - // merge, conservatively drop the sequence, to avoid doing partial - // RR elimination. If the branch predicates for the two merge differ, - // mixing them is unsafe. - ClearSequenceProgress(); - } else { - // Conservatively merge the ReleaseMetadata information. - if (RRI.ReleaseMetadata != Other.RRI.ReleaseMetadata) - RRI.ReleaseMetadata = 0; - - RRI.KnownSafe = RRI.KnownSafe && Other.RRI.KnownSafe; - RRI.IsTailCallRelease = RRI.IsTailCallRelease && - Other.RRI.IsTailCallRelease; - RRI.Calls.insert(Other.RRI.Calls.begin(), Other.RRI.Calls.end()); - - // Merge the insert point sets. If there are any differences, - // that makes this a partial merge. - Partial = RRI.ReverseInsertPts.size() != Other.RRI.ReverseInsertPts.size(); - for (SmallPtrSet<Instruction *, 2>::const_iterator - I = Other.RRI.ReverseInsertPts.begin(), - E = Other.RRI.ReverseInsertPts.end(); I != E; ++I) - Partial |= RRI.ReverseInsertPts.insert(*I); - } -} - -namespace { - /// BBState - Per-BasicBlock state. - class BBState { - /// TopDownPathCount - The number of unique control paths from the entry - /// which can reach this block. - unsigned TopDownPathCount; - - /// BottomUpPathCount - The number of unique control paths to exits - /// from this block. - unsigned BottomUpPathCount; - - /// MapTy - A type for PerPtrTopDown and PerPtrBottomUp. - typedef MapVector<const Value *, PtrState> MapTy; - - /// PerPtrTopDown - The top-down traversal uses this to record information - /// known about a pointer at the bottom of each block. - MapTy PerPtrTopDown; - - /// PerPtrBottomUp - The bottom-up traversal uses this to record information - /// known about a pointer at the top of each block. - MapTy PerPtrBottomUp; - - /// Preds, Succs - Effective successors and predecessors of the current - /// block (this ignores ignorable edges and ignored backedges). 
- SmallVector<BasicBlock *, 2> Preds; - SmallVector<BasicBlock *, 2> Succs; - - public: - BBState() : TopDownPathCount(0), BottomUpPathCount(0) {} - - typedef MapTy::iterator ptr_iterator; - typedef MapTy::const_iterator ptr_const_iterator; - - ptr_iterator top_down_ptr_begin() { return PerPtrTopDown.begin(); } - ptr_iterator top_down_ptr_end() { return PerPtrTopDown.end(); } - ptr_const_iterator top_down_ptr_begin() const { - return PerPtrTopDown.begin(); - } - ptr_const_iterator top_down_ptr_end() const { - return PerPtrTopDown.end(); - } - - ptr_iterator bottom_up_ptr_begin() { return PerPtrBottomUp.begin(); } - ptr_iterator bottom_up_ptr_end() { return PerPtrBottomUp.end(); } - ptr_const_iterator bottom_up_ptr_begin() const { - return PerPtrBottomUp.begin(); - } - ptr_const_iterator bottom_up_ptr_end() const { - return PerPtrBottomUp.end(); - } - - /// SetAsEntry - Mark this block as being an entry block, which has one - /// path from the entry by definition. - void SetAsEntry() { TopDownPathCount = 1; } - - /// SetAsExit - Mark this block as being an exit block, which has one - /// path to an exit by definition. - void SetAsExit() { BottomUpPathCount = 1; } - - PtrState &getPtrTopDownState(const Value *Arg) { - return PerPtrTopDown[Arg]; - } - - PtrState &getPtrBottomUpState(const Value *Arg) { - return PerPtrBottomUp[Arg]; - } - - void clearBottomUpPointers() { - PerPtrBottomUp.clear(); - } - - void clearTopDownPointers() { - PerPtrTopDown.clear(); - } - - void InitFromPred(const BBState &Other); - void InitFromSucc(const BBState &Other); - void MergePred(const BBState &Other); - void MergeSucc(const BBState &Other); - - /// GetAllPathCount - Return the number of possible unique paths from an - /// entry to an exit which pass through this block. This is only valid - /// after both the top-down and bottom-up traversals are complete. - unsigned GetAllPathCount() const { - assert(TopDownPathCount != 0); - assert(BottomUpPathCount != 0); - return TopDownPathCount * BottomUpPathCount; - } - - // Specialized CFG utilities. - typedef SmallVectorImpl<BasicBlock *>::const_iterator edge_iterator; - edge_iterator pred_begin() { return Preds.begin(); } - edge_iterator pred_end() { return Preds.end(); } - edge_iterator succ_begin() { return Succs.begin(); } - edge_iterator succ_end() { return Succs.end(); } - - void addSucc(BasicBlock *Succ) { Succs.push_back(Succ); } - void addPred(BasicBlock *Pred) { Preds.push_back(Pred); } - - bool isExit() const { return Succs.empty(); } - }; -} - -void BBState::InitFromPred(const BBState &Other) { - PerPtrTopDown = Other.PerPtrTopDown; - TopDownPathCount = Other.TopDownPathCount; -} - -void BBState::InitFromSucc(const BBState &Other) { - PerPtrBottomUp = Other.PerPtrBottomUp; - BottomUpPathCount = Other.BottomUpPathCount; -} - -/// MergePred - The top-down traversal uses this to merge information about -/// predecessors to form the initial state for a new block. -void BBState::MergePred(const BBState &Other) { - // Other.TopDownPathCount can be 0, in which case it is either dead or a - // loop backedge. Loop backedges are special. - TopDownPathCount += Other.TopDownPathCount; - - // Check for overflow. If we have overflow, fall back to conservative behavior. - if (TopDownPathCount < Other.TopDownPathCount) { - clearTopDownPointers(); - return; - } - - // For each entry in the other set, if our set has an entry with the same key, - // merge the entries. Otherwise, copy the entry and merge it with an empty - // entry. 
- for (ptr_const_iterator MI = Other.top_down_ptr_begin(),
- ME = Other.top_down_ptr_end(); MI != ME; ++MI) {
- std::pair<ptr_iterator, bool> Pair = PerPtrTopDown.insert(*MI);
- Pair.first->second.Merge(Pair.second ? PtrState() : MI->second,
- /*TopDown=*/true);
- }
-
- // For each entry in our set, if the other set doesn't have an entry with the
- // same key, force it to merge with an empty entry.
- for (ptr_iterator MI = top_down_ptr_begin(),
- ME = top_down_ptr_end(); MI != ME; ++MI)
- if (Other.PerPtrTopDown.find(MI->first) == Other.PerPtrTopDown.end())
- MI->second.Merge(PtrState(), /*TopDown=*/true);
-}
-
-/// MergeSucc - The bottom-up traversal uses this to merge information about
-/// successors to form the initial state for a new block.
-void BBState::MergeSucc(const BBState &Other) {
- // Other.BottomUpPathCount can be 0, in which case it is either dead or a
- // loop backedge. Loop backedges are special.
- BottomUpPathCount += Other.BottomUpPathCount;
-
- // Check for overflow. If we have overflow, fall back to conservative behavior.
- if (BottomUpPathCount < Other.BottomUpPathCount) {
- clearBottomUpPointers();
- return;
- }
-
- // For each entry in the other set, if our set has an entry with the
- // same key, merge the entries. Otherwise, copy the entry and merge
- // it with an empty entry.
- for (ptr_const_iterator MI = Other.bottom_up_ptr_begin(),
- ME = Other.bottom_up_ptr_end(); MI != ME; ++MI) {
- std::pair<ptr_iterator, bool> Pair = PerPtrBottomUp.insert(*MI);
- Pair.first->second.Merge(Pair.second ? PtrState() : MI->second,
- /*TopDown=*/false);
- }
-
- // For each entry in our set, if the other set doesn't have an entry
- // with the same key, force it to merge with an empty entry.
- for (ptr_iterator MI = bottom_up_ptr_begin(),
- ME = bottom_up_ptr_end(); MI != ME; ++MI)
- if (Other.PerPtrBottomUp.find(MI->first) == Other.PerPtrBottomUp.end())
- MI->second.Merge(PtrState(), /*TopDown=*/false);
-}
-
-namespace {
- /// ObjCARCOpt - The main ARC optimization pass.
- class ObjCARCOpt : public FunctionPass {
- bool Changed;
- ProvenanceAnalysis PA;
-
- /// Run - A flag indicating whether this optimization pass should run.
- bool Run;
-
- /// RetainRVCallee, etc. - Declarations for ObjC runtime
- /// functions, for use in creating calls to them. These are initialized
- /// lazily to avoid cluttering up the Module with unused declarations.
- Constant *RetainRVCallee, *AutoreleaseRVCallee, *ReleaseCallee,
- *RetainCallee, *RetainBlockCallee, *AutoreleaseCallee;
-
- /// UsedInThisFunction - Flags which determine whether each of the
- /// interesting runtime functions is in fact used in the current function.
- unsigned UsedInThisFunction;
-
- /// ImpreciseReleaseMDKind - The Metadata Kind for clang.imprecise_release
- /// metadata.
- unsigned ImpreciseReleaseMDKind;
-
- /// CopyOnEscapeMDKind - The Metadata Kind for clang.arc.copy_on_escape
- /// metadata.
- unsigned CopyOnEscapeMDKind;
-
- /// NoObjCARCExceptionsMDKind - The Metadata Kind for
- /// clang.arc.no_objc_arc_exceptions metadata.
- unsigned NoObjCARCExceptionsMDKind; - - Constant *getRetainRVCallee(Module *M); - Constant *getAutoreleaseRVCallee(Module *M); - Constant *getReleaseCallee(Module *M); - Constant *getRetainCallee(Module *M); - Constant *getRetainBlockCallee(Module *M); - Constant *getAutoreleaseCallee(Module *M); - - bool IsRetainBlockOptimizable(const Instruction *Inst); - - void OptimizeRetainCall(Function &F, Instruction *Retain); - bool OptimizeRetainRVCall(Function &F, Instruction *RetainRV); - void OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV); - void OptimizeIndividualCalls(Function &F); - - void CheckForCFGHazards(const BasicBlock *BB, - DenseMap<const BasicBlock *, BBState> &BBStates, - BBState &MyStates) const; - bool VisitInstructionBottomUp(Instruction *Inst, - BasicBlock *BB, - MapVector<Value *, RRInfo> &Retains, - BBState &MyStates); - bool VisitBottomUp(BasicBlock *BB, - DenseMap<const BasicBlock *, BBState> &BBStates, - MapVector<Value *, RRInfo> &Retains); - bool VisitInstructionTopDown(Instruction *Inst, - DenseMap<Value *, RRInfo> &Releases, - BBState &MyStates); - bool VisitTopDown(BasicBlock *BB, - DenseMap<const BasicBlock *, BBState> &BBStates, - DenseMap<Value *, RRInfo> &Releases); - bool Visit(Function &F, - DenseMap<const BasicBlock *, BBState> &BBStates, - MapVector<Value *, RRInfo> &Retains, - DenseMap<Value *, RRInfo> &Releases); - - void MoveCalls(Value *Arg, RRInfo &RetainsToMove, RRInfo &ReleasesToMove, - MapVector<Value *, RRInfo> &Retains, - DenseMap<Value *, RRInfo> &Releases, - SmallVectorImpl<Instruction *> &DeadInsts, - Module *M); - - bool PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates, - MapVector<Value *, RRInfo> &Retains, - DenseMap<Value *, RRInfo> &Releases, - Module *M); - - void OptimizeWeakCalls(Function &F); - - bool OptimizeSequences(Function &F); - - void OptimizeReturns(Function &F); - - virtual void getAnalysisUsage(AnalysisUsage &AU) const; - virtual bool doInitialization(Module &M); - virtual bool runOnFunction(Function &F); - virtual void releaseMemory(); - - public: - static char ID; - ObjCARCOpt() : FunctionPass(ID) { - initializeObjCARCOptPass(*PassRegistry::getPassRegistry()); - } - }; -} - -char ObjCARCOpt::ID = 0; -INITIALIZE_PASS_BEGIN(ObjCARCOpt, - "objc-arc", "ObjC ARC optimization", false, false) -INITIALIZE_PASS_DEPENDENCY(ObjCARCAliasAnalysis) -INITIALIZE_PASS_END(ObjCARCOpt, - "objc-arc", "ObjC ARC optimization", false, false) - -Pass *llvm::createObjCARCOptPass() { - return new ObjCARCOpt(); -} - -void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<ObjCARCAliasAnalysis>(); - AU.addRequired<AliasAnalysis>(); - // ARC optimization doesn't currently split critical edges. - AU.setPreservesCFG(); -} - -bool ObjCARCOpt::IsRetainBlockOptimizable(const Instruction *Inst) { - // Without the magic metadata tag, we have to assume this might be an - // objc_retainBlock call inserted to convert a block pointer to an id, - // in which case it really is needed. - if (!Inst->getMetadata(CopyOnEscapeMDKind)) - return false; - - // If the pointer "escapes" (not including being used in a call), - // the copy may be needed. - if (DoesObjCBlockEscape(Inst)) - return false; - - // Otherwise, it's not needed. 
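// (Illustrative note, not from the original source: a block that is only
// passed directly as a call argument does not "escape" for the purposes of
// this check, so the stack-to-heap copy performed by objc_retainBlock can be
// dropped; a block whose address is stored away may outlive the frame, so
// the copy must stay.)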
- return true; -} - -Constant *ObjCARCOpt::getRetainRVCallee(Module *M) { - if (!RetainRVCallee) { - LLVMContext &C = M->getContext(); - Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - Type *Params[] = { I8X }; - FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttributeSet Attribute = - AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex, - Attribute::get(C, Attribute::NoUnwind)); - RetainRVCallee = - M->getOrInsertFunction("objc_retainAutoreleasedReturnValue", FTy, - Attribute); - } - return RetainRVCallee; -} - -Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) { - if (!AutoreleaseRVCallee) { - LLVMContext &C = M->getContext(); - Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - Type *Params[] = { I8X }; - FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttributeSet Attribute = - AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex, - Attribute::get(C, Attribute::NoUnwind)); - AutoreleaseRVCallee = - M->getOrInsertFunction("objc_autoreleaseReturnValue", FTy, - Attribute); - } - return AutoreleaseRVCallee; -} - -Constant *ObjCARCOpt::getReleaseCallee(Module *M) { - if (!ReleaseCallee) { - LLVMContext &C = M->getContext(); - Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; - AttributeSet Attribute = - AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex, - Attribute::get(C, Attribute::NoUnwind)); - ReleaseCallee = - M->getOrInsertFunction( - "objc_release", - FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false), - Attribute); - } - return ReleaseCallee; -} - -Constant *ObjCARCOpt::getRetainCallee(Module *M) { - if (!RetainCallee) { - LLVMContext &C = M->getContext(); - Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; - AttributeSet Attribute = - AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex, - Attribute::get(C, Attribute::NoUnwind)); - RetainCallee = - M->getOrInsertFunction( - "objc_retain", - FunctionType::get(Params[0], Params, /*isVarArg=*/false), - Attribute); - } - return RetainCallee; -} - -Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) { - if (!RetainBlockCallee) { - LLVMContext &C = M->getContext(); - Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; - // objc_retainBlock is not nounwind because it calls user copy constructors - // which could theoretically throw. - RetainBlockCallee = - M->getOrInsertFunction( - "objc_retainBlock", - FunctionType::get(Params[0], Params, /*isVarArg=*/false), - AttributeSet()); - } - return RetainBlockCallee; -} - -Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) { - if (!AutoreleaseCallee) { - LLVMContext &C = M->getContext(); - Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; - AttributeSet Attribute = - AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex, - Attribute::get(C, Attribute::NoUnwind)); - AutoreleaseCallee = - M->getOrInsertFunction( - "objc_autorelease", - FunctionType::get(Params[0], Params, /*isVarArg=*/false), - Attribute); - } - return AutoreleaseCallee; -} - -/// IsPotentialUse - Test whether the given value is possibly a -/// reference-counted pointer, including tests which utilize AliasAnalysis. -static bool IsPotentialUse(const Value *Op, AliasAnalysis &AA) { - // First make the rudimentary check. - if (!IsPotentialUse(Op)) - return false; - - // Objects in constant memory are not reference-counted.
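// (For example -- an illustration, not from the original source -- a pointer
// into a constant global such as a compile-time constant string lives in
// read-only memory and is never deallocated, so retain/release traffic on it
// is uninteresting here.)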
- if (AA.pointsToConstantMemory(Op)) - return false; - - // Pointers in constant memory are not pointing to reference-counted objects. - if (const LoadInst *LI = dyn_cast<LoadInst>(Op)) - if (AA.pointsToConstantMemory(LI->getPointerOperand())) - return false; - - // Otherwise assume the worst. - return true; -} - -/// CanAlterRefCount - Test whether the given instruction can result in a -/// reference count modification (positive or negative) for the pointer's -/// object. -static bool -CanAlterRefCount(const Instruction *Inst, const Value *Ptr, - ProvenanceAnalysis &PA, InstructionClass Class) { - switch (Class) { - case IC_Autorelease: - case IC_AutoreleaseRV: - case IC_User: - // These operations never directly modify a reference count. - return false; - default: break; - } - - ImmutableCallSite CS = static_cast<const Value *>(Inst); - assert(CS && "Only calls can alter reference counts!"); - - // See if AliasAnalysis can help us with the call. - AliasAnalysis::ModRefBehavior MRB = PA.getAA()->getModRefBehavior(CS); - if (AliasAnalysis::onlyReadsMemory(MRB)) - return false; - if (AliasAnalysis::onlyAccessesArgPointees(MRB)) { - for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); - I != E; ++I) { - const Value *Op = *I; - if (IsPotentialUse(Op, *PA.getAA()) && PA.related(Ptr, Op)) - return true; - } - return false; - } - - // Assume the worst. - return true; -} - -/// CanUse - Test whether the given instruction can "use" the given pointer's -/// object in a way that requires the reference count to be positive. -static bool -CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA, - InstructionClass Class) { - // IC_Call operations (as opposed to IC_CallOrUser) never "use" objc pointers. - if (Class == IC_Call) - return false; - - // Consider various instructions which may have pointer arguments which are - // not "uses". - if (const ICmpInst *ICI = dyn_cast<ICmpInst>(Inst)) { - // Comparing a pointer with null, or any other constant, isn't really a use, - // because we don't care what the pointer points to, or about the values - // of any other dynamic reference-counted pointers. - if (!IsPotentialUse(ICI->getOperand(1), *PA.getAA())) - return false; - } else if (ImmutableCallSite CS = static_cast<const Value *>(Inst)) { - // For calls, just check the arguments (and not the callee operand). - for (ImmutableCallSite::arg_iterator OI = CS.arg_begin(), - OE = CS.arg_end(); OI != OE; ++OI) { - const Value *Op = *OI; - if (IsPotentialUse(Op, *PA.getAA()) && PA.related(Ptr, Op)) - return true; - } - return false; - } else if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - // Special-case stores, because we don't care about the stored value, just - // the store address. - const Value *Op = GetUnderlyingObjCPtr(SI->getPointerOperand()); - // If we can't tell what the underlying object was, assume there is a - // dependence. - return IsPotentialUse(Op, *PA.getAA()) && PA.related(Op, Ptr); - } - - // Check each operand for a match. - for (User::const_op_iterator OI = Inst->op_begin(), OE = Inst->op_end(); - OI != OE; ++OI) { - const Value *Op = *OI; - if (IsPotentialUse(Op, *PA.getAA()) && PA.related(Ptr, Op)) - return true; - } - return false; -} - -/// CanInterruptRV - Test whether the given instruction can autorelease -/// any pointer or cause an autoreleasepool pop. 
-static bool -CanInterruptRV(InstructionClass Class) { - switch (Class) { - case IC_AutoreleasepoolPop: - case IC_CallOrUser: - case IC_Call: - case IC_Autorelease: - case IC_AutoreleaseRV: - case IC_FusedRetainAutorelease: - case IC_FusedRetainAutoreleaseRV: - return true; - default: - return false; - } -} - -namespace { - /// DependenceKind - There are several kinds of dependence-like concepts in - /// use here. - enum DependenceKind { - NeedsPositiveRetainCount, - AutoreleasePoolBoundary, - CanChangeRetainCount, - RetainAutoreleaseDep, ///< Blocks objc_retainAutorelease. - RetainAutoreleaseRVDep, ///< Blocks objc_retainAutoreleaseReturnValue. - RetainRVDep ///< Blocks objc_retainAutoreleasedReturnValue. - }; -} - -/// Depends - Test if there can be dependencies on Inst through Arg. This -/// function only tests dependencies relevant for removing pairs of calls. -static bool -Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg, - ProvenanceAnalysis &PA) { - // If we've reached the definition of Arg, stop. - if (Inst == Arg) - return true; - - switch (Flavor) { - case NeedsPositiveRetainCount: { - InstructionClass Class = GetInstructionClass(Inst); - switch (Class) { - case IC_AutoreleasepoolPop: - case IC_AutoreleasepoolPush: - case IC_None: - return false; - default: - return CanUse(Inst, Arg, PA, Class); - } - } - - case AutoreleasePoolBoundary: { - InstructionClass Class = GetInstructionClass(Inst); - switch (Class) { - case IC_AutoreleasepoolPop: - case IC_AutoreleasepoolPush: - // These mark the end and begin of an autorelease pool scope. - return true; - default: - // Nothing else does this. - return false; - } - } - - case CanChangeRetainCount: { - InstructionClass Class = GetInstructionClass(Inst); - switch (Class) { - case IC_AutoreleasepoolPop: - // Conservatively assume this can decrement any count. - return true; - case IC_AutoreleasepoolPush: - case IC_None: - return false; - default: - return CanAlterRefCount(Inst, Arg, PA, Class); - } - } - - case RetainAutoreleaseDep: - switch (GetBasicInstructionClass(Inst)) { - case IC_AutoreleasepoolPop: - case IC_AutoreleasepoolPush: - // Don't merge an objc_autorelease with an objc_retain inside a different - // autoreleasepool scope. - return true; - case IC_Retain: - case IC_RetainRV: - // Check for a retain of the same pointer for merging. - return GetObjCArg(Inst) == Arg; - default: - // Nothing else matters for objc_retainAutorelease formation. - return false; - } - - case RetainAutoreleaseRVDep: { - InstructionClass Class = GetBasicInstructionClass(Inst); - switch (Class) { - case IC_Retain: - case IC_RetainRV: - // Check for a retain of the same pointer for merging. - return GetObjCArg(Inst) == Arg; - default: - // Anything that can autorelease interrupts - // retainAutoreleaseReturnValue formation. - return CanInterruptRV(Class); - } - } - - case RetainRVDep: - return CanInterruptRV(GetBasicInstructionClass(Inst)); - } - - llvm_unreachable("Invalid dependence flavor"); -} - -/// FindDependencies - Walk up the CFG from StartPos (which is in StartBB) and -/// find local and non-local dependencies on Arg. -/// TODO: Cache results? 
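/// For example (an illustrative sketch, not from the original source), with
/// Flavor == NeedsPositiveRetainCount and Arg == %x, a walk starting at the
/// objc_release below stops at the intervening call:
/// \code
///   %0 = tail call i8* @objc_retain(i8* %x)
///   call void @use(i8* %x)          ; CanUse -> recorded as a dependency
///   call void @objc_release(i8* %x) ; StartInst
/// \endcode
/// leaving the call to the (hypothetical) @use in DependingInstructions.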
-static void -FindDependencies(DependenceKind Flavor, - const Value *Arg, - BasicBlock *StartBB, Instruction *StartInst, - SmallPtrSet<Instruction *, 4> &DependingInstructions, - SmallPtrSet<const BasicBlock *, 4> &Visited, - ProvenanceAnalysis &PA) { - BasicBlock::iterator StartPos = StartInst; - - SmallVector<std::pair<BasicBlock *, BasicBlock::iterator>, 4> Worklist; - Worklist.push_back(std::make_pair(StartBB, StartPos)); - do { - std::pair<BasicBlock *, BasicBlock::iterator> Pair = - Worklist.pop_back_val(); - BasicBlock *LocalStartBB = Pair.first; - BasicBlock::iterator LocalStartPos = Pair.second; - BasicBlock::iterator StartBBBegin = LocalStartBB->begin(); - for (;;) { - if (LocalStartPos == StartBBBegin) { - pred_iterator PI(LocalStartBB), PE(LocalStartBB, false); - if (PI == PE) - // If we've reached the function entry, produce a null dependence. - DependingInstructions.insert(0); - else - // Add the predecessors to the worklist. - do { - BasicBlock *PredBB = *PI; - if (Visited.insert(PredBB)) - Worklist.push_back(std::make_pair(PredBB, PredBB->end())); - } while (++PI != PE); - break; - } - - Instruction *Inst = --LocalStartPos; - if (Depends(Flavor, Inst, Arg, PA)) { - DependingInstructions.insert(Inst); - break; - } - } - } while (!Worklist.empty()); - - // Determine whether the original StartBB post-dominates all of the blocks we - // visited. If not, insert a sentinel indicating that most optimizations are - // not safe. - for (SmallPtrSet<const BasicBlock *, 4>::const_iterator I = Visited.begin(), - E = Visited.end(); I != E; ++I) { - const BasicBlock *BB = *I; - if (BB == StartBB) - continue; - const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); - for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) { - const BasicBlock *Succ = *SI; - if (Succ != StartBB && !Visited.count(Succ)) { - DependingInstructions.insert(reinterpret_cast<Instruction *>(-1)); - return; - } - } - } -} - -static bool isNullOrUndef(const Value *V) { - return isa<ConstantPointerNull>(V) || isa<UndefValue>(V); -} - -static bool isNoopInstruction(const Instruction *I) { - return isa<BitCastInst>(I) || - (isa<GetElementPtrInst>(I) && - cast<GetElementPtrInst>(I)->hasAllZeroIndices()); -} - -/// OptimizeRetainCall - Turn objc_retain into -/// objc_retainAutoreleasedReturnValue if the operand is a return value. -void -ObjCARCOpt::OptimizeRetainCall(Function &F, Instruction *Retain) { - ImmutableCallSite CS(GetObjCArg(Retain)); - const Instruction *Call = CS.getInstruction(); - if (!Call) return; - if (Call->getParent() != Retain->getParent()) return; - - // Check that the call is next to the retain. - BasicBlock::const_iterator I = Call; - ++I; - while (isNoopInstruction(I)) ++I; - if (&*I != Retain) - return; - - // Turn it into an objc_retainAutoreleasedReturnValue. - Changed = true; - ++NumPeeps; - - DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainCall: Transforming " - "objc_retain => " - "objc_retainAutoreleasedReturnValue since the operand is a " - "return value.\n" - " Old: " - << *Retain << "\n"); - - cast<CallInst>(Retain)->setCalledFunction(getRetainRVCallee(F.getParent())); - - DEBUG(dbgs() << " New: " - << *Retain << "\n"); -} - -/// OptimizeRetainRVCall - Turn objc_retainAutoreleasedReturnValue into -/// objc_retain if the operand is not a return value. Or, if it can be paired -/// with an objc_autoreleaseReturnValue, delete the pair and return true.
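/// For instance (an illustrative sketch, not from the original source), in
/// \code
///   %0 = call i8* @objc_autoreleaseReturnValue(i8* %p)
///   %1 = call i8* @objc_retainAutoreleasedReturnValue(i8* %p)
/// \endcode
/// the two calls cancel out and both are erased, while a retainRV whose
/// operand does not come from an immediately preceding call or invoke is
/// downgraded to a plain objc_retain.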
-bool -ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { - // Check for the argument being from an immediately preceding call or invoke. - const Value *Arg = GetObjCArg(RetainRV); - ImmutableCallSite CS(Arg); - if (const Instruction *Call = CS.getInstruction()) { - if (Call->getParent() == RetainRV->getParent()) { - BasicBlock::const_iterator I = Call; - ++I; - while (isNoopInstruction(I)) ++I; - if (&*I == RetainRV) - return false; - } else if (const InvokeInst *II = dyn_cast<InvokeInst>(Call)) { - BasicBlock *RetainRVParent = RetainRV->getParent(); - if (II->getNormalDest() == RetainRVParent) { - BasicBlock::const_iterator I = RetainRVParent->begin(); - while (isNoopInstruction(I)) ++I; - if (&*I == RetainRV) - return false; - } - } - } - - // Check for being preceded by an objc_autoreleaseReturnValue on the same - // pointer. In this case, we can delete the pair. - BasicBlock::iterator I = RetainRV, Begin = RetainRV->getParent()->begin(); - if (I != Begin) { - do --I; while (I != Begin && isNoopInstruction(I)); - if (GetBasicInstructionClass(I) == IC_AutoreleaseRV && - GetObjCArg(I) == Arg) { - Changed = true; - ++NumPeeps; - - DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainRVCall: Erasing " << *I << "\n" - << " Erasing " << *RetainRV - << "\n"); - - EraseInstruction(I); - EraseInstruction(RetainRV); - return true; - } - } - - // Turn it into a plain objc_retain. - Changed = true; - ++NumPeeps; - - DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainRVCall: Transforming " - "objc_retainAutoreleasedReturnValue => " - "objc_retain since the operand is not a return value.\n" - " Old: " - << *RetainRV << "\n"); - - cast<CallInst>(RetainRV)->setCalledFunction(getRetainCallee(F.getParent())); - - DEBUG(dbgs() << " New: " - << *RetainRV << "\n"); - - return false; -} - -/// OptimizeAutoreleaseRVCall - Turn objc_autoreleaseReturnValue into -/// objc_autorelease if the result is not used as a return value. -void -ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV) { - // Check for a return of the pointer value. - const Value *Ptr = GetObjCArg(AutoreleaseRV); - SmallVector<const Value *, 2> Users; - Users.push_back(Ptr); - do { - Ptr = Users.pop_back_val(); - for (Value::const_use_iterator UI = Ptr->use_begin(), UE = Ptr->use_end(); - UI != UE; ++UI) { - const User *I = *UI; - if (isa<ReturnInst>(I) || GetBasicInstructionClass(I) == IC_RetainRV) - return; - if (isa<BitCastInst>(I)) - Users.push_back(I); - } - } while (!Users.empty()); - - Changed = true; - ++NumPeeps; - - DEBUG(dbgs() << "ObjCARCOpt::OptimizeAutoreleaseRVCall: Transforming " - "objc_autoreleaseReturnValue => " - "objc_autorelease since its operand is not used as a return " - "value.\n" - " Old: " - << *AutoreleaseRV << "\n"); - - cast<CallInst>(AutoreleaseRV)-> - setCalledFunction(getAutoreleaseCallee(F.getParent())); - - DEBUG(dbgs() << " New: " - << *AutoreleaseRV << "\n"); - -} - -/// OptimizeIndividualCalls - Visit each call, one at a time, and make -/// simplifications without doing any additional analysis. -void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { - // Reset all the flags in preparation for recomputing them. - UsedInThisFunction = 0; - - // Visit all objc_* calls in F.
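// As an illustration of one of the peepholes below (an IR sketch, not from
// the original source), passing a null pointer-to-weak to a weak entry point
// is undefined behavior, so
// \code
//   %v = call i8* @objc_loadWeak(i8** null)
// \endcode
// is rewritten into a store of undef through the null pointer, with all uses
// of %v replaced by undef.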
- for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { - Instruction *Inst = &*I++; - - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Visiting: " << - *Inst << "\n"); - - InstructionClass Class = GetBasicInstructionClass(Inst); - - switch (Class) { - default: break; - - // Delete no-op casts. These function calls have special semantics, but - // the semantics are entirely implemented via lowering in the front-end, - // so by the time they reach the optimizer, they are just no-op calls - // which return their argument. - // - // There are gray areas here, as the ability to cast reference-counted - // pointers to raw void* and back allows code to break ARC assumptions, - // however these are currently considered to be unimportant. - case IC_NoopCast: - Changed = true; - ++NumNoops; - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Erasing no-op cast:" - " " << *Inst << "\n"); - EraseInstruction(Inst); - continue; - - // If the pointer-to-weak-pointer is null, it's undefined behavior. - case IC_StoreWeak: - case IC_LoadWeak: - case IC_LoadWeakRetained: - case IC_InitWeak: - case IC_DestroyWeak: { - CallInst *CI = cast<CallInst>(Inst); - if (isNullOrUndef(CI->getArgOperand(0))) { - Changed = true; - Type *Ty = CI->getArgOperand(0)->getType(); - new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), - Constant::getNullValue(Ty), - CI); - llvm::Value *NewValue = UndefValue::get(CI->getType()); - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: A null " - "pointer-to-weak-pointer is undefined behavior.\n" - " Old = " << *CI << - "\n New = " << - *NewValue << "\n"); - CI->replaceAllUsesWith(NewValue); - CI->eraseFromParent(); - continue; - } - break; - } - case IC_CopyWeak: - case IC_MoveWeak: { - CallInst *CI = cast<CallInst>(Inst); - if (isNullOrUndef(CI->getArgOperand(0)) || - isNullOrUndef(CI->getArgOperand(1))) { - Changed = true; - Type *Ty = CI->getArgOperand(0)->getType(); - new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), - Constant::getNullValue(Ty), - CI); - - llvm::Value *NewValue = UndefValue::get(CI->getType()); - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: A null " - "pointer-to-weak-pointer is undefined behavior.\n" - " Old = " << *CI << - "\n New = " << - *NewValue << "\n"); - - CI->replaceAllUsesWith(NewValue); - CI->eraseFromParent(); - continue; - } - break; - } - case IC_Retain: - OptimizeRetainCall(F, Inst); - break; - case IC_RetainRV: - if (OptimizeRetainRVCall(F, Inst)) - continue; - break; - case IC_AutoreleaseRV: - OptimizeAutoreleaseRVCall(F, Inst); - break; - } - - // objc_autorelease(x) -> objc_release(x) if x is otherwise unused. - if (IsAutorelease(Class) && Inst->use_empty()) { - CallInst *Call = cast<CallInst>(Inst); - const Value *Arg = Call->getArgOperand(0); - Arg = FindSingleUseIdentifiedObject(Arg); - if (Arg) { - Changed = true; - ++NumAutoreleases; - - // Create the declaration lazily. 
- LLVMContext &C = Inst->getContext(); - CallInst *NewCall = - CallInst::Create(getReleaseCallee(F.getParent()), - Call->getArgOperand(0), "", Call); - NewCall->setMetadata(ImpreciseReleaseMDKind, - MDNode::get(C, ArrayRef<Value *>())); - - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Replacing " - "objc_autorelease(x) with objc_release(x) since x is " - "otherwise unused.\n" - " Old: " << *Call << - "\n New: " << - *NewCall << "\n"); - - EraseInstruction(Call); - Inst = NewCall; - Class = IC_Release; - } - } - - // For functions which can never be passed stack arguments, add - // a tail keyword. - if (IsAlwaysTail(Class)) { - Changed = true; - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Adding tail keyword" - " to function since it can never be passed stack args: " << *Inst << - "\n"); - cast<CallInst>(Inst)->setTailCall(); - } - - // Set nounwind as needed. - if (IsNoThrow(Class)) { - Changed = true; - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Found no throw" - " class. Setting nounwind on: " << *Inst << "\n"); - cast<CallInst>(Inst)->setDoesNotThrow(); - } - - if (!IsNoopOnNull(Class)) { - UsedInThisFunction |= 1 << Class; - continue; - } - - const Value *Arg = GetObjCArg(Inst); - - // ARC calls with null are no-ops. Delete them. - if (isNullOrUndef(Arg)) { - Changed = true; - ++NumNoops; - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: ARC calls with " - "null are no-ops. Erasing: " << *Inst << "\n"); - EraseInstruction(Inst); - continue; - } - - // Keep track of which of retain, release, autorelease, and retain_block - // are actually present in this function. - UsedInThisFunction |= 1 << Class; - - // If Arg is a PHI, and one or more incoming values to the - // PHI are null, and the call is control-equivalent to the PHI, and there - // are no relevant side effects between the PHI and the call, the call - // could be pushed up to just those paths with non-null incoming values. - // For now, don't bother splitting critical edges for this. - SmallVector<std::pair<Instruction *, const Value *>, 4> Worklist; - Worklist.push_back(std::make_pair(Inst, Arg)); - do { - std::pair<Instruction *, const Value *> Pair = Worklist.pop_back_val(); - Inst = Pair.first; - Arg = Pair.second; - - const PHINode *PN = dyn_cast<PHINode>(Arg); - if (!PN) continue; - - // Determine if the PHI has any null operands, or any incoming - // critical edges. - bool HasNull = false; - bool HasCriticalEdges = false; - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { - Value *Incoming = - StripPointerCastsAndObjCCalls(PN->getIncomingValue(i)); - if (isNullOrUndef(Incoming)) - HasNull = true; - else if (cast<TerminatorInst>(PN->getIncomingBlock(i)->back()) - .getNumSuccessors() != 1) { - HasCriticalEdges = true; - break; - } - } - // If we have null operands and no critical edges, optimize. - if (!HasCriticalEdges && HasNull) { - SmallPtrSet<Instruction *, 4> DependingInstructions; - SmallPtrSet<const BasicBlock *, 4> Visited; - - // Check that there is nothing that cares about the reference - // count between the call and the phi. - switch (Class) { - case IC_Retain: - case IC_RetainBlock: - // These can always be moved up. - break; - case IC_Release: - // These can't be moved across things that care about the retain - // count. - FindDependencies(NeedsPositiveRetainCount, Arg, - Inst->getParent(), Inst, - DependingInstructions, Visited, PA); - break; - case IC_Autorelease: - // These can't be moved across autorelease pool scope boundaries.
- FindDependencies(AutoreleasePoolBoundary, Arg, - Inst->getParent(), Inst, - DependingInstructions, Visited, PA); - break; - case IC_RetainRV: - case IC_AutoreleaseRV: - // Don't move these; the RV optimization depends on the autoreleaseRV - // being tail called, and the retainRV being immediately after a call - // (which might still happen if we get lucky with codegen layout, but - // it's not worth taking the chance). - continue; - default: - llvm_unreachable("Invalid dependence flavor"); - } - - if (DependingInstructions.size() == 1 && - *DependingInstructions.begin() == PN) { - Changed = true; - ++NumPartialNoops; - // Clone the call into each predecessor that has a non-null value. - CallInst *CInst = cast<CallInst>(Inst); - Type *ParamTy = CInst->getArgOperand(0)->getType(); - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { - Value *Incoming = - StripPointerCastsAndObjCCalls(PN->getIncomingValue(i)); - if (!isNullOrUndef(Incoming)) { - CallInst *Clone = cast<CallInst>(CInst->clone()); - Value *Op = PN->getIncomingValue(i); - Instruction *InsertPos = &PN->getIncomingBlock(i)->back(); - if (Op->getType() != ParamTy) - Op = new BitCastInst(Op, ParamTy, "", InsertPos); - Clone->setArgOperand(0, Op); - Clone->insertBefore(InsertPos); - Worklist.push_back(std::make_pair(Clone, Incoming)); - } - } - // Erase the original call. - EraseInstruction(CInst); - continue; - } - } - } while (!Worklist.empty()); - - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Finished Queue.\n\n"); - - } -} - -/// CheckForCFGHazards - Check for critical edges, loop boundaries, irreducible -/// control flow, or other CFG structures where moving code across the edge -/// would result in it being executed more. -void -ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, - DenseMap<const BasicBlock *, BBState> &BBStates, - BBState &MyStates) const { - // If any top-down local-use or possible-dec has a succ which is earlier in - // the sequence, forget it. - for (BBState::ptr_iterator I = MyStates.top_down_ptr_begin(), - E = MyStates.top_down_ptr_end(); I != E; ++I) - switch (I->second.GetSeq()) { - default: break; - case S_Use: { - const Value *Arg = I->first; - const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); - bool SomeSuccHasSame = false; - bool AllSuccsHaveSame = true; - PtrState &S = I->second; - succ_const_iterator SI(TI), SE(TI, false); - - // If the terminator is an invoke marked with the - // clang.arc.no_objc_arc_exceptions metadata, the unwind edge can be - // ignored, for ARC purposes. - if (isa<InvokeInst>(TI) && TI->getMetadata(NoObjCARCExceptionsMDKind)) - --SE; - - for (; SI != SE; ++SI) { - Sequence SuccSSeq = S_None; - bool SuccSRRIKnownSafe = false; - // If VisitBottomUp has pointer information for this successor, take - // what we know about it. 
- DenseMap<const BasicBlock *, BBState>::iterator BBI = - BBStates.find(*SI); - assert(BBI != BBStates.end()); - const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg); - SuccSSeq = SuccS.GetSeq(); - SuccSRRIKnownSafe = SuccS.RRI.KnownSafe; - switch (SuccSSeq) { - case S_None: - case S_CanRelease: { - if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) { - S.ClearSequenceProgress(); - break; - } - continue; - } - case S_Use: - SomeSuccHasSame = true; - break; - case S_Stop: - case S_Release: - case S_MovableRelease: - if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) - AllSuccsHaveSame = false; - break; - case S_Retain: - llvm_unreachable("bottom-up pointer in retain state!"); - } - } - // If the state at the other end of any of the successor edges - // matches the current state, require all edges to match. This - // guards against loops in the middle of a sequence. - if (SomeSuccHasSame && !AllSuccsHaveSame) - S.ClearSequenceProgress(); - break; - } - case S_CanRelease: { - const Value *Arg = I->first; - const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); - bool SomeSuccHasSame = false; - bool AllSuccsHaveSame = true; - PtrState &S = I->second; - succ_const_iterator SI(TI), SE(TI, false); - - // If the terminator is an invoke marked with the - // clang.arc.no_objc_arc_exceptions metadata, the unwind edge can be - // ignored, for ARC purposes. - if (isa<InvokeInst>(TI) && TI->getMetadata(NoObjCARCExceptionsMDKind)) - --SE; - - for (; SI != SE; ++SI) { - Sequence SuccSSeq = S_None; - bool SuccSRRIKnownSafe = false; - // If VisitBottomUp has pointer information for this successor, take - // what we know about it. - DenseMap<const BasicBlock *, BBState>::iterator BBI = - BBStates.find(*SI); - assert(BBI != BBStates.end()); - const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg); - SuccSSeq = SuccS.GetSeq(); - SuccSRRIKnownSafe = SuccS.RRI.KnownSafe; - switch (SuccSSeq) { - case S_None: { - if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) { - S.ClearSequenceProgress(); - break; - } - continue; - } - case S_CanRelease: - SomeSuccHasSame = true; - break; - case S_Stop: - case S_Release: - case S_MovableRelease: - case S_Use: - if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) - AllSuccsHaveSame = false; - break; - case S_Retain: - llvm_unreachable("bottom-up pointer in retain state!"); - } - } - // If the state at the other end of any of the successor edges - // matches the current state, require all edges to match. This - // guards against loops in the middle of a sequence. - if (SomeSuccHasSame && !AllSuccsHaveSame) - S.ClearSequenceProgress(); - break; - } - } -} - -bool -ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, - BasicBlock *BB, - MapVector<Value *, RRInfo> &Retains, - BBState &MyStates) { - bool NestingDetected = false; - InstructionClass Class = GetInstructionClass(Inst); - const Value *Arg = 0; - - switch (Class) { - case IC_Release: { - Arg = GetObjCArg(Inst); - - PtrState &S = MyStates.getPtrBottomUpState(Arg); - - // If we see two releases in a row on the same pointer, make a note; - // we'll circle back to revisit it after we've hopefully eliminated the - // second release, which may allow us to eliminate the first release too. - // Theoretically we could implement removal of nested retain+release - // pairs by making PtrState hold a stack of states, but this is - // simple and avoids adding overhead for the non-nested case.
- if (S.GetSeq() == S_Release || S.GetSeq() == S_MovableRelease) - NestingDetected = true; - - MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind); - S.ResetSequenceProgress(ReleaseMetadata ? S_MovableRelease : S_Release); - S.RRI.ReleaseMetadata = ReleaseMetadata; - S.RRI.KnownSafe = S.IsKnownIncremented(); - S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); - S.RRI.Calls.insert(Inst); - - S.SetKnownPositiveRefCount(); - break; - } - case IC_RetainBlock: - // An objc_retainBlock call with just a use may need to be kept, - // because it may be copying a block from the stack to the heap. - if (!IsRetainBlockOptimizable(Inst)) - break; - // FALLTHROUGH - case IC_Retain: - case IC_RetainRV: { - Arg = GetObjCArg(Inst); - - PtrState &S = MyStates.getPtrBottomUpState(Arg); - S.SetKnownPositiveRefCount(); - - switch (S.GetSeq()) { - case S_Stop: - case S_Release: - case S_MovableRelease: - case S_Use: - S.RRI.ReverseInsertPts.clear(); - // FALL THROUGH - case S_CanRelease: - // Don't do retain+release tracking for IC_RetainRV, because it's - // better to let it remain as the first instruction after a call. - if (Class != IC_RetainRV) { - S.RRI.IsRetainBlock = Class == IC_RetainBlock; - Retains[Inst] = S.RRI; - } - S.ClearSequenceProgress(); - break; - case S_None: - break; - case S_Retain: - llvm_unreachable("bottom-up pointer in retain state!"); - } - return NestingDetected; - } - case IC_AutoreleasepoolPop: - // Conservatively, clear MyStates for all known pointers. - MyStates.clearBottomUpPointers(); - return NestingDetected; - case IC_AutoreleasepoolPush: - case IC_None: - // These are irrelevant. - return NestingDetected; - default: - break; - } - - // Consider any other possible effects of this instruction on each - // pointer being tracked. - for (BBState::ptr_iterator MI = MyStates.bottom_up_ptr_begin(), - ME = MyStates.bottom_up_ptr_end(); MI != ME; ++MI) { - const Value *Ptr = MI->first; - if (Ptr == Arg) - continue; // Handled above. - PtrState &S = MI->second; - Sequence Seq = S.GetSeq(); - - // Check for possible releases. - if (CanAlterRefCount(Inst, Ptr, PA, Class)) { - S.ClearRefCount(); - switch (Seq) { - case S_Use: - S.SetSeq(S_CanRelease); - continue; - case S_CanRelease: - case S_Release: - case S_MovableRelease: - case S_Stop: - case S_None: - break; - case S_Retain: - llvm_unreachable("bottom-up pointer in retain state!"); - } - } - - // Check for possible direct uses. - switch (Seq) { - case S_Release: - case S_MovableRelease: - if (CanUse(Inst, Ptr, PA, Class)) { - assert(S.RRI.ReverseInsertPts.empty()); - // If this is an invoke instruction, we're scanning it as part of - // one of its successor blocks, since we can't insert code after it - // in its own block, and we don't want to split critical edges. - if (isa<InvokeInst>(Inst)) - S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt()); - else - S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst))); - S.SetSeq(S_Use); - } else if (Seq == S_Release && - (Class == IC_User || Class == IC_CallOrUser)) { - // Non-movable releases depend on any possible objc pointer use. - S.SetSeq(S_Stop); - assert(S.RRI.ReverseInsertPts.empty()); - // As above; handle invoke specially. 
- if (isa<InvokeInst>(Inst)) - S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt()); - else - S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst))); - } - break; - case S_Stop: - if (CanUse(Inst, Ptr, PA, Class)) - S.SetSeq(S_Use); - break; - case S_CanRelease: - case S_Use: - case S_None: - break; - case S_Retain: - llvm_unreachable("bottom-up pointer in retain state!"); - } - } - - return NestingDetected; -} - -bool -ObjCARCOpt::VisitBottomUp(BasicBlock *BB, - DenseMap<const BasicBlock *, BBState> &BBStates, - MapVector<Value *, RRInfo> &Retains) { - bool NestingDetected = false; - BBState &MyStates = BBStates[BB]; - - // Merge the states from each successor to compute the initial state - // for the current block. - BBState::edge_iterator SI(MyStates.succ_begin()), - SE(MyStates.succ_end()); - if (SI != SE) { - const BasicBlock *Succ = *SI; - DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ); - assert(I != BBStates.end()); - MyStates.InitFromSucc(I->second); - ++SI; - for (; SI != SE; ++SI) { - Succ = *SI; - I = BBStates.find(Succ); - assert(I != BBStates.end()); - MyStates.MergeSucc(I->second); - } - } - - // Visit all the instructions, bottom-up. - for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) { - Instruction *Inst = llvm::prior(I); - - // Invoke instructions are visited as part of their successors (below). - if (isa<InvokeInst>(Inst)) - continue; - - NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates); - } - - // If there's a predecessor with an invoke, visit the invoke as if it were - // part of this block, since we can't insert code after an invoke in its own - // block, and we don't want to split critical edges. - for (BBState::edge_iterator PI(MyStates.pred_begin()), - PE(MyStates.pred_end()); PI != PE; ++PI) { - BasicBlock *Pred = *PI; - if (InvokeInst *II = dyn_cast<InvokeInst>(&Pred->back())) - NestingDetected |= VisitInstructionBottomUp(II, BB, Retains, MyStates); - } - - return NestingDetected; -} - -bool -ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, - DenseMap<Value *, RRInfo> &Releases, - BBState &MyStates) { - bool NestingDetected = false; - InstructionClass Class = GetInstructionClass(Inst); - const Value *Arg = 0; - - switch (Class) { - case IC_RetainBlock: - // An objc_retainBlock call with just a use may need to be kept, - // because it may be copying a block from the stack to the heap. - if (!IsRetainBlockOptimizable(Inst)) - break; - // FALLTHROUGH - case IC_Retain: - case IC_RetainRV: { - Arg = GetObjCArg(Inst); - - PtrState &S = MyStates.getPtrTopDownState(Arg); - - // Don't do retain+release tracking for IC_RetainRV, because it's - // better to let it remain as the first instruction after a call. - if (Class != IC_RetainRV) { - // If we see two retains in a row on the same pointer, make a note; - // we'll circle back to revisit it after we've hopefully eliminated the - // second retain, which may allow us to eliminate the first retain too. - // Theoretically we could implement removal of nested retain+release - // pairs by making PtrState hold a stack of states, but this is - // simple and avoids adding overhead for the non-nested case.
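// An IR sketch of the nesting being detected (illustrative, not from the
// original source):
// \code
//   %0 = call i8* @objc_retain(i8* %x)
//   %1 = call i8* @objc_retain(i8* %x)  ; second retain while already S_Retain
//   ...
//   call void @objc_release(i8* %x)
//   call void @objc_release(i8* %x)
// \endcode
// Eliminating the inner pair on one iteration may allow a later iteration to
// eliminate the outer pair as well.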
- if (S.GetSeq() == S_Retain) - NestingDetected = true; - - S.ResetSequenceProgress(S_Retain); - S.RRI.IsRetainBlock = Class == IC_RetainBlock; - S.RRI.KnownSafe = S.IsKnownIncremented(); - S.RRI.Calls.insert(Inst); - } - - S.SetKnownPositiveRefCount(); - - // A retain can be a potential use; proceed to the generic checking - // code below. - break; - } - case IC_Release: { - Arg = GetObjCArg(Inst); - - PtrState &S = MyStates.getPtrTopDownState(Arg); - S.ClearRefCount(); - - switch (S.GetSeq()) { - case S_Retain: - case S_CanRelease: - S.RRI.ReverseInsertPts.clear(); - // FALL THROUGH - case S_Use: - S.RRI.ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind); - S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); - Releases[Inst] = S.RRI; - S.ClearSequenceProgress(); - break; - case S_None: - break; - case S_Stop: - case S_Release: - case S_MovableRelease: - llvm_unreachable("top-down pointer in release state!"); - } - break; - } - case IC_AutoreleasepoolPop: - // Conservatively, clear MyStates for all known pointers. - MyStates.clearTopDownPointers(); - return NestingDetected; - case IC_AutoreleasepoolPush: - case IC_None: - // These are irrelevant. - return NestingDetected; - default: - break; - } - - // Consider any other possible effects of this instruction on each - // pointer being tracked. - for (BBState::ptr_iterator MI = MyStates.top_down_ptr_begin(), - ME = MyStates.top_down_ptr_end(); MI != ME; ++MI) { - const Value *Ptr = MI->first; - if (Ptr == Arg) - continue; // Handled above. - PtrState &S = MI->second; - Sequence Seq = S.GetSeq(); - - // Check for possible releases. - if (CanAlterRefCount(Inst, Ptr, PA, Class)) { - S.ClearRefCount(); - switch (Seq) { - case S_Retain: - S.SetSeq(S_CanRelease); - assert(S.RRI.ReverseInsertPts.empty()); - S.RRI.ReverseInsertPts.insert(Inst); - - // One call can't cause a transition from S_Retain to S_CanRelease - // and from S_CanRelease to S_Use. If we've made the first transition, - // we're done. - continue; - case S_Use: - case S_CanRelease: - case S_None: - break; - case S_Stop: - case S_Release: - case S_MovableRelease: - llvm_unreachable("top-down pointer in release state!"); - } - } - - // Check for possible direct uses. - switch (Seq) { - case S_CanRelease: - if (CanUse(Inst, Ptr, PA, Class)) - S.SetSeq(S_Use); - break; - case S_Retain: - case S_Use: - case S_None: - break; - case S_Stop: - case S_Release: - case S_MovableRelease: - llvm_unreachable("top-down pointer in release state!"); - } - } - - return NestingDetected; -} - -bool -ObjCARCOpt::VisitTopDown(BasicBlock *BB, - DenseMap<const BasicBlock *, BBState> &BBStates, - DenseMap<Value *, RRInfo> &Releases) { - bool NestingDetected = false; - BBState &MyStates = BBStates[BB]; - - // Merge the states from each predecessor to compute the initial state - // for the current block. - BBState::edge_iterator PI(MyStates.pred_begin()), - PE(MyStates.pred_end()); - if (PI != PE) { - const BasicBlock *Pred = *PI; - DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred); - assert(I != BBStates.end()); - MyStates.InitFromPred(I->second); - ++PI; - for (; PI != PE; ++PI) { - Pred = *PI; - I = BBStates.find(Pred); - assert(I != BBStates.end()); - MyStates.MergePred(I->second); - } - } - - // Visit all the instructions, top-down.
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { - Instruction *Inst = I; - NestingDetected |= VisitInstructionTopDown(Inst, Releases, MyStates); - } - - CheckForCFGHazards(BB, BBStates, MyStates); - return NestingDetected; -} - -static void -ComputePostOrders(Function &F, - SmallVectorImpl<BasicBlock *> &PostOrder, - SmallVectorImpl<BasicBlock *> &ReverseCFGPostOrder, - unsigned NoObjCARCExceptionsMDKind, - DenseMap<const BasicBlock *, BBState> &BBStates) { - /// Visited - The visited set, for doing DFS walks. - SmallPtrSet<BasicBlock *, 16> Visited; - - // Do DFS, computing the PostOrder. - SmallPtrSet<BasicBlock *, 16> OnStack; - SmallVector<std::pair<BasicBlock *, succ_iterator>, 16> SuccStack; - - // Functions always have exactly one entry block, and we don't have - // any other block that we treat like an entry block. - BasicBlock *EntryBB = &F.getEntryBlock(); - BBState &MyStates = BBStates[EntryBB]; - MyStates.SetAsEntry(); - TerminatorInst *EntryTI = cast<TerminatorInst>(&EntryBB->back()); - SuccStack.push_back(std::make_pair(EntryBB, succ_iterator(EntryTI))); - Visited.insert(EntryBB); - OnStack.insert(EntryBB); - do { - dfs_next_succ: - BasicBlock *CurrBB = SuccStack.back().first; - TerminatorInst *TI = cast<TerminatorInst>(&CurrBB->back()); - succ_iterator SE(TI, false); - - // If the terminator is an invoke marked with the - // clang.arc.no_objc_arc_exceptions metadata, the unwind edge can be - // ignored, for ARC purposes. - if (isa<InvokeInst>(TI) && TI->getMetadata(NoObjCARCExceptionsMDKind)) - --SE; - - while (SuccStack.back().second != SE) { - BasicBlock *SuccBB = *SuccStack.back().second++; - if (Visited.insert(SuccBB)) { - TerminatorInst *TI = cast<TerminatorInst>(&SuccBB->back()); - SuccStack.push_back(std::make_pair(SuccBB, succ_iterator(TI))); - BBStates[CurrBB].addSucc(SuccBB); - BBState &SuccStates = BBStates[SuccBB]; - SuccStates.addPred(CurrBB); - OnStack.insert(SuccBB); - goto dfs_next_succ; - } - - if (!OnStack.count(SuccBB)) { - BBStates[CurrBB].addSucc(SuccBB); - BBStates[SuccBB].addPred(CurrBB); - } - } - OnStack.erase(CurrBB); - PostOrder.push_back(CurrBB); - SuccStack.pop_back(); - } while (!SuccStack.empty()); - - Visited.clear(); - - // Do reverse-CFG DFS, computing the reverse-CFG PostOrder. - // Functions may have many exits, and there are also blocks which we treat - // as exits due to ignored edges. - SmallVector<std::pair<BasicBlock *, BBState::edge_iterator>, 16> PredStack; - for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { - BasicBlock *ExitBB = I; - BBState &MyStates = BBStates[ExitBB]; - if (!MyStates.isExit()) - continue; - - MyStates.SetAsExit(); - - PredStack.push_back(std::make_pair(ExitBB, MyStates.pred_begin())); - Visited.insert(ExitBB); - while (!PredStack.empty()) { - reverse_dfs_next_succ: - BBState::edge_iterator PE = BBStates[PredStack.back().first].pred_end(); - while (PredStack.back().second != PE) { - BasicBlock *BB = *PredStack.back().second++; - if (Visited.insert(BB)) { - PredStack.push_back(std::make_pair(BB, BBStates[BB].pred_begin())); - goto reverse_dfs_next_succ; - } - } - ReverseCFGPostOrder.push_back(PredStack.pop_back_val().first); - } - } -} - -// Visit - Visit the function both top-down and bottom-up. -bool -ObjCARCOpt::Visit(Function &F, - DenseMap<const BasicBlock *, BBState> &BBStates, - MapVector<Value *, RRInfo> &Retains, - DenseMap<Value *, RRInfo> &Releases) { - - // Use reverse-postorder traversals, because we magically know that loops - // will be well behaved, i.e.
they won't repeatedly call retain on a single - // pointer without doing a release. We can't use the ReversePostOrderTraversal - // class here because we want the reverse-CFG postorder to consider each - // function exit point, and we want to ignore selected cycle edges. - SmallVector<BasicBlock *, 16> PostOrder; - SmallVector<BasicBlock *, 16> ReverseCFGPostOrder; - ComputePostOrders(F, PostOrder, ReverseCFGPostOrder, - NoObjCARCExceptionsMDKind, - BBStates); - - // Use reverse-postorder on the reverse CFG for bottom-up. - bool BottomUpNestingDetected = false; - for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator I = - ReverseCFGPostOrder.rbegin(), E = ReverseCFGPostOrder.rend(); - I != E; ++I) - BottomUpNestingDetected |= VisitBottomUp(*I, BBStates, Retains); - - // Use reverse-postorder for top-down. - bool TopDownNestingDetected = false; - for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator I = - PostOrder.rbegin(), E = PostOrder.rend(); - I != E; ++I) - TopDownNestingDetected |= VisitTopDown(*I, BBStates, Releases); - - return TopDownNestingDetected && BottomUpNestingDetected; -} - -/// MoveCalls - Move the calls in RetainsToMove and ReleasesToMove. -void ObjCARCOpt::MoveCalls(Value *Arg, - RRInfo &RetainsToMove, - RRInfo &ReleasesToMove, - MapVector<Value *, RRInfo> &Retains, - DenseMap<Value *, RRInfo> &Releases, - SmallVectorImpl<Instruction *> &DeadInsts, - Module *M) { - Type *ArgTy = Arg->getType(); - Type *ParamTy = PointerType::getUnqual(Type::getInt8Ty(ArgTy->getContext())); - - // Insert the new retain and release calls. - for (SmallPtrSet<Instruction *, 2>::const_iterator - PI = ReleasesToMove.ReverseInsertPts.begin(), - PE = ReleasesToMove.ReverseInsertPts.end(); PI != PE; ++PI) { - Instruction *InsertPt = *PI; - Value *MyArg = ArgTy == ParamTy ? Arg : - new BitCastInst(Arg, ParamTy, "", InsertPt); - CallInst *Call = - CallInst::Create(RetainsToMove.IsRetainBlock ? - getRetainBlockCallee(M) : getRetainCallee(M), - MyArg, "", InsertPt); - Call->setDoesNotThrow(); - if (RetainsToMove.IsRetainBlock) - Call->setMetadata(CopyOnEscapeMDKind, - MDNode::get(M->getContext(), ArrayRef<Value *>())); - else - Call->setTailCall(); - } - for (SmallPtrSet<Instruction *, 2>::const_iterator - PI = RetainsToMove.ReverseInsertPts.begin(), - PE = RetainsToMove.ReverseInsertPts.end(); PI != PE; ++PI) { - Instruction *InsertPt = *PI; - Value *MyArg = ArgTy == ParamTy ? Arg : - new BitCastInst(Arg, ParamTy, "", InsertPt); - CallInst *Call = CallInst::Create(getReleaseCallee(M), MyArg, - "", InsertPt); - // Attach a clang.imprecise_release metadata tag, if appropriate. - if (MDNode *M = ReleasesToMove.ReleaseMetadata) - Call->setMetadata(ImpreciseReleaseMDKind, M); - Call->setDoesNotThrow(); - if (ReleasesToMove.IsTailCallRelease) - Call->setTailCall(); - } - - // Delete the original retain and release calls. - for (SmallPtrSet<Instruction *, 2>::const_iterator - AI = RetainsToMove.Calls.begin(), - AE = RetainsToMove.Calls.end(); AI != AE; ++AI) { - Instruction *OrigRetain = *AI; - Retains.blot(OrigRetain); - DeadInsts.push_back(OrigRetain); - } - for (SmallPtrSet<Instruction *, 2>::const_iterator - AI = ReleasesToMove.Calls.begin(), - AE = ReleasesToMove.Calls.end(); AI != AE; ++AI) { - Instruction *OrigRelease = *AI; - Releases.erase(OrigRelease); - DeadInsts.push_back(OrigRelease); - } -} - -/// PerformCodePlacement - Identify pairings between the retains and releases, -/// and delete and/or move them. 
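/// The pairing is iterative because one retain can reach several releases and
/// vice versa. As a sketch (not from the original source), in
/// \code
///   %0 = call i8* @objc_retain(i8* %x)
///   br i1 %c, label %left, label %right
/// left:
///   call void @objc_release(i8* %x)
///   br label %done
/// right:
///   call void @objc_release(i8* %x)
///   br label %done
/// \endcode
/// the single retain is connected to both releases, and the set is eliminated
/// only if the path counts on the retain and release sides balance (here,
/// two paths on each side).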
-bool -ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> - &BBStates, - MapVector<Value *, RRInfo> &Retains, - DenseMap<Value *, RRInfo> &Releases, - Module *M) { - bool AnyPairsCompletelyEliminated = false; - RRInfo RetainsToMove; - RRInfo ReleasesToMove; - SmallVector<Instruction *, 4> NewRetains; - SmallVector<Instruction *, 4> NewReleases; - SmallVector<Instruction *, 8> DeadInsts; - - // Visit each retain. - for (MapVector<Value *, RRInfo>::const_iterator I = Retains.begin(), - E = Retains.end(); I != E; ++I) { - Value *V = I->first; - if (!V) continue; // blotted - - Instruction *Retain = cast<Instruction>(V); - Value *Arg = GetObjCArg(Retain); - - // If the object being released is in static or stack storage, we know it's - // not being managed by ObjC reference counting, so we can delete pairs - // regardless of what possible decrements or uses lie between them. - bool KnownSafe = isa<Constant>(Arg) || isa<AllocaInst>(Arg); - - // A constant pointer can't be pointing to an object on the heap. It may - // be reference-counted, but it won't be deleted. - if (const LoadInst *LI = dyn_cast<LoadInst>(Arg)) - if (const GlobalVariable *GV = - dyn_cast<GlobalVariable>( - StripPointerCastsAndObjCCalls(LI->getPointerOperand()))) - if (GV->isConstant()) - KnownSafe = true; - - // If a pair happens in a region where it is known that the reference count - // is already incremented, we can similarly ignore possible decrements. - bool KnownSafeTD = true, KnownSafeBU = true; - - // Connect the dots between the top-down-collected RetainsToMove and - // bottom-up-collected ReleasesToMove to form sets of related calls. - // This is an iterative process so that we connect multiple releases - // to multiple retains if needed. - unsigned OldDelta = 0; - unsigned NewDelta = 0; - unsigned OldCount = 0; - unsigned NewCount = 0; - bool FirstRelease = true; - bool FirstRetain = true; - NewRetains.push_back(Retain); - for (;;) { - for (SmallVectorImpl<Instruction *>::const_iterator - NI = NewRetains.begin(), NE = NewRetains.end(); NI != NE; ++NI) { - Instruction *NewRetain = *NI; - MapVector<Value *, RRInfo>::const_iterator It = Retains.find(NewRetain); - assert(It != Retains.end()); - const RRInfo &NewRetainRRI = It->second; - KnownSafeTD &= NewRetainRRI.KnownSafe; - for (SmallPtrSet<Instruction *, 2>::const_iterator - LI = NewRetainRRI.Calls.begin(), - LE = NewRetainRRI.Calls.end(); LI != LE; ++LI) { - Instruction *NewRetainRelease = *LI; - DenseMap<Value *, RRInfo>::const_iterator Jt = - Releases.find(NewRetainRelease); - if (Jt == Releases.end()) - goto next_retain; - const RRInfo &NewRetainReleaseRRI = Jt->second; - assert(NewRetainReleaseRRI.Calls.count(NewRetain)); - if (ReleasesToMove.Calls.insert(NewRetainRelease)) { - OldDelta -= - BBStates[NewRetainRelease->getParent()].GetAllPathCount(); - - // Merge the ReleaseMetadata and IsTailCallRelease values. - if (FirstRelease) { - ReleasesToMove.ReleaseMetadata = - NewRetainReleaseRRI.ReleaseMetadata; - ReleasesToMove.IsTailCallRelease = - NewRetainReleaseRRI.IsTailCallRelease; - FirstRelease = false; - } else { - if (ReleasesToMove.ReleaseMetadata != - NewRetainReleaseRRI.ReleaseMetadata) - ReleasesToMove.ReleaseMetadata = 0; - if (ReleasesToMove.IsTailCallRelease != - NewRetainReleaseRRI.IsTailCallRelease) - ReleasesToMove.IsTailCallRelease = false; - } - - // Collect the optimal insertion points. 
- if (!KnownSafe) - for (SmallPtrSet<Instruction *, 2>::const_iterator - RI = NewRetainReleaseRRI.ReverseInsertPts.begin(), - RE = NewRetainReleaseRRI.ReverseInsertPts.end(); - RI != RE; ++RI) { - Instruction *RIP = *RI; - if (ReleasesToMove.ReverseInsertPts.insert(RIP)) - NewDelta -= BBStates[RIP->getParent()].GetAllPathCount(); - } - NewReleases.push_back(NewRetainRelease); - } - } - } - NewRetains.clear(); - if (NewReleases.empty()) break; - - // Back the other way. - for (SmallVectorImpl<Instruction *>::const_iterator - NI = NewReleases.begin(), NE = NewReleases.end(); NI != NE; ++NI) { - Instruction *NewRelease = *NI; - DenseMap<Value *, RRInfo>::const_iterator It = - Releases.find(NewRelease); - assert(It != Releases.end()); - const RRInfo &NewReleaseRRI = It->second; - KnownSafeBU &= NewReleaseRRI.KnownSafe; - for (SmallPtrSet<Instruction *, 2>::const_iterator - LI = NewReleaseRRI.Calls.begin(), - LE = NewReleaseRRI.Calls.end(); LI != LE; ++LI) { - Instruction *NewReleaseRetain = *LI; - MapVector<Value *, RRInfo>::const_iterator Jt = - Retains.find(NewReleaseRetain); - if (Jt == Retains.end()) - goto next_retain; - const RRInfo &NewReleaseRetainRRI = Jt->second; - assert(NewReleaseRetainRRI.Calls.count(NewRelease)); - if (RetainsToMove.Calls.insert(NewReleaseRetain)) { - unsigned PathCount = - BBStates[NewReleaseRetain->getParent()].GetAllPathCount(); - OldDelta += PathCount; - OldCount += PathCount; - - // Merge the IsRetainBlock values. - if (FirstRetain) { - RetainsToMove.IsRetainBlock = NewReleaseRetainRRI.IsRetainBlock; - FirstRetain = false; - } else if (RetainsToMove.IsRetainBlock != - NewReleaseRetainRRI.IsRetainBlock) - // It's not possible to merge the sequences if one uses - // objc_retain and the other uses objc_retainBlock. - goto next_retain; - - // Collect the optimal insertion points. - if (!KnownSafe) - for (SmallPtrSet<Instruction *, 2>::const_iterator - RI = NewReleaseRetainRRI.ReverseInsertPts.begin(), - RE = NewReleaseRetainRRI.ReverseInsertPts.end(); - RI != RE; ++RI) { - Instruction *RIP = *RI; - if (RetainsToMove.ReverseInsertPts.insert(RIP)) { - PathCount = BBStates[RIP->getParent()].GetAllPathCount(); - NewDelta += PathCount; - NewCount += PathCount; - } - } - NewRetains.push_back(NewReleaseRetain); - } - } - } - NewReleases.clear(); - if (NewRetains.empty()) break; - } - - // If the pointer is known incremented or nested, we can safely delete the - // pair regardless of what's between them. - if (KnownSafeTD || KnownSafeBU) { - RetainsToMove.ReverseInsertPts.clear(); - ReleasesToMove.ReverseInsertPts.clear(); - NewCount = 0; - } else { - // Determine whether the new insertion points we computed preserve the - // balance of retain and release calls through the program. - // TODO: If the fully aggressive solution isn't valid, try to find a - // less aggressive solution which is. - if (NewDelta != 0) - goto next_retain; - } - - // Determine whether the original call points are balanced in the retain and - // release calls through the program. If not, conservatively don't touch - // them. - // TODO: It's theoretically possible to do code motion in this case, as - // long as the existing imbalances are maintained. - if (OldDelta != 0) - goto next_retain; - - // Ok, everything checks out and we're all set. Let's move some code!
- Changed = true; - assert(OldCount != 0 && "Unreachable code?"); - AnyPairsCompletelyEliminated = NewCount == 0; - NumRRs += OldCount - NewCount; - MoveCalls(Arg, RetainsToMove, ReleasesToMove, - Retains, Releases, DeadInsts, M); - - next_retain: - NewReleases.clear(); - NewRetains.clear(); - RetainsToMove.clear(); - ReleasesToMove.clear(); - } - - // Now that we're done moving everything, we can delete the newly dead - // instructions, as we no longer need them as insert points. - while (!DeadInsts.empty()) - EraseInstruction(DeadInsts.pop_back_val()); - - return AnyPairsCompletelyEliminated; -} - -/// OptimizeWeakCalls - Weak pointer optimizations. -void ObjCARCOpt::OptimizeWeakCalls(Function &F) { - // First, do memdep-style RLE and S2L optimizations. We can't use memdep - // itself because it uses AliasAnalysis and we need to do provenance - // queries instead. - for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { - Instruction *Inst = &*I++; - - DEBUG(dbgs() << "ObjCARCOpt::OptimizeWeakCalls: Visiting: " << *Inst << - "\n"); - - InstructionClass Class = GetBasicInstructionClass(Inst); - if (Class != IC_LoadWeak && Class != IC_LoadWeakRetained) - continue; - - // Delete objc_loadWeak calls with no users. - if (Class == IC_LoadWeak && Inst->use_empty()) { - Inst->eraseFromParent(); - continue; - } - - // TODO: For now, just look for an earlier available version of this value - // within the same block. Theoretically, we could do memdep-style non-local - // analysis too, but that would want caching. A better approach would be to - // use the technique that EarlyCSE uses. - inst_iterator Current = llvm::prior(I); - BasicBlock *CurrentBB = Current.getBasicBlockIterator(); - for (BasicBlock::iterator B = CurrentBB->begin(), - J = Current.getInstructionIterator(); - J != B; --J) { - Instruction *EarlierInst = &*llvm::prior(J); - InstructionClass EarlierClass = GetInstructionClass(EarlierInst); - switch (EarlierClass) { - case IC_LoadWeak: - case IC_LoadWeakRetained: { - // If this is loading from the same pointer, replace this load's value - // with that one. - CallInst *Call = cast<CallInst>(Inst); - CallInst *EarlierCall = cast<CallInst>(EarlierInst); - Value *Arg = Call->getArgOperand(0); - Value *EarlierArg = EarlierCall->getArgOperand(0); - switch (PA.getAA()->alias(Arg, EarlierArg)) { - case AliasAnalysis::MustAlias: - Changed = true; - // If the load has a builtin retain, insert a plain retain for it. - if (Class == IC_LoadWeakRetained) { - CallInst *CI = - CallInst::Create(getRetainCallee(F.getParent()), EarlierCall, - "", Call); - CI->setTailCall(); - } - // Zap the fully redundant load. - Call->replaceAllUsesWith(EarlierCall); - Call->eraseFromParent(); - goto clobbered; - case AliasAnalysis::MayAlias: - case AliasAnalysis::PartialAlias: - goto clobbered; - case AliasAnalysis::NoAlias: - break; - } - break; - } - case IC_StoreWeak: - case IC_InitWeak: { - // If this is storing to the same pointer and has the same size etc. - // replace this load's value with the stored value. - CallInst *Call = cast<CallInst>(Inst); - CallInst *EarlierCall = cast<CallInst>(EarlierInst); - Value *Arg = Call->getArgOperand(0); - Value *EarlierArg = EarlierCall->getArgOperand(0); - switch (PA.getAA()->alias(Arg, EarlierArg)) { - case AliasAnalysis::MustAlias: - Changed = true; - // If the load has a builtin retain, insert a plain retain for it. 
- if (Class == IC_LoadWeakRetained) {
- CallInst *CI =
- CallInst::Create(getRetainCallee(F.getParent()), EarlierCall,
- "", Call);
- CI->setTailCall();
- }
- // Zap the fully redundant load.
- Call->replaceAllUsesWith(EarlierCall->getArgOperand(1));
- Call->eraseFromParent();
- goto clobbered;
- case AliasAnalysis::MayAlias:
- case AliasAnalysis::PartialAlias:
- goto clobbered;
- case AliasAnalysis::NoAlias:
- break;
- }
- break;
- }
- case IC_MoveWeak:
- case IC_CopyWeak:
- // TODO: Grab the copied value.
- goto clobbered;
- case IC_AutoreleasepoolPush:
- case IC_None:
- case IC_User:
- // Weak pointers are only modified through the weak entry points
- // (and arbitrary calls, which could call the weak entry points).
- break;
- default:
- // Anything else could modify the weak pointer.
- goto clobbered;
- }
- }
- clobbered:;
- }
-
- // Then, for each destroyWeak with an alloca operand, check to see if
- // the alloca and all its users can be zapped.
- for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
- Instruction *Inst = &*I++;
- InstructionClass Class = GetBasicInstructionClass(Inst);
- if (Class != IC_DestroyWeak)
- continue;
-
- CallInst *Call = cast<CallInst>(Inst);
- Value *Arg = Call->getArgOperand(0);
- if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Arg)) {
- for (Value::use_iterator UI = Alloca->use_begin(),
- UE = Alloca->use_end(); UI != UE; ++UI) {
- const Instruction *UserInst = cast<Instruction>(*UI);
- switch (GetBasicInstructionClass(UserInst)) {
- case IC_InitWeak:
- case IC_StoreWeak:
- case IC_DestroyWeak:
- continue;
- default:
- goto done;
- }
- }
- Changed = true;
- for (Value::use_iterator UI = Alloca->use_begin(),
- UE = Alloca->use_end(); UI != UE; ) {
- CallInst *UserInst = cast<CallInst>(*UI++);
- switch (GetBasicInstructionClass(UserInst)) {
- case IC_InitWeak:
- case IC_StoreWeak:
- // These functions return their second argument.
- UserInst->replaceAllUsesWith(UserInst->getArgOperand(1));
- break;
- case IC_DestroyWeak:
- // No return value.
- break;
- default:
- llvm_unreachable("alloca really is used!");
- }
- UserInst->eraseFromParent();
- }
- Alloca->eraseFromParent();
- done:;
- }
- }
-
- DEBUG(dbgs() << "ObjCARCOpt::OptimizeWeakCalls: Finished List.\n\n");
-
-}
-
-/// OptimizeSequences - Identify program paths which execute sequences of
-/// retains and releases which can be eliminated.
-bool ObjCARCOpt::OptimizeSequences(Function &F) {
- /// Releases, Retains - These are used to store the results of the main flow
- /// analysis. These use Value* as the key instead of Instruction* so that the
- /// map stays valid when we get around to rewriting code and calls get
- /// replaced by arguments.
- DenseMap<Value *, RRInfo> Releases;
- MapVector<Value *, RRInfo> Retains;
-
- /// BBStates - This is used during the traversal of the function to track the
- /// states for each identified object at each block.
- DenseMap<const BasicBlock *, BBState> BBStates;
-
- // Analyze the CFG of the function, and all instructions.
- bool NestingDetected = Visit(F, BBStates, Retains, Releases);
-
- // Transform.
- return PerformCodePlacement(BBStates, Retains, Releases, F.getParent()) &&
- NestingDetected;
-}
-
-/// OptimizeReturns - Look for this pattern:
-/// \code
-/// %call = call i8* @something(...)
-/// %2 = call i8* @objc_retain(i8* %call)
-/// %3 = call i8* @objc_autorelease(i8* %2)
-/// ret i8* %3
-/// \endcode
-/// And delete the retain and autorelease.
-/// -/// Otherwise if it's just this: -/// \code -/// %3 = call i8* @objc_autorelease(i8* %2) -/// ret i8* %3 -/// \endcode -/// convert the autorelease to autoreleaseRV. -void ObjCARCOpt::OptimizeReturns(Function &F) { - if (!F.getReturnType()->isPointerTy()) - return; - - SmallPtrSet<Instruction *, 4> DependingInstructions; - SmallPtrSet<const BasicBlock *, 4> Visited; - for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { - BasicBlock *BB = FI; - ReturnInst *Ret = dyn_cast<ReturnInst>(&BB->back()); - - DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Visiting: " << *Ret << "\n"); - - if (!Ret) continue; - - const Value *Arg = StripPointerCastsAndObjCCalls(Ret->getOperand(0)); - FindDependencies(NeedsPositiveRetainCount, Arg, - BB, Ret, DependingInstructions, Visited, PA); - if (DependingInstructions.size() != 1) - goto next_block; - - { - CallInst *Autorelease = - dyn_cast_or_null<CallInst>(*DependingInstructions.begin()); - if (!Autorelease) - goto next_block; - InstructionClass AutoreleaseClass = GetBasicInstructionClass(Autorelease); - if (!IsAutorelease(AutoreleaseClass)) - goto next_block; - if (GetObjCArg(Autorelease) != Arg) - goto next_block; - - DependingInstructions.clear(); - Visited.clear(); - - // Check that there is nothing that can affect the reference - // count between the autorelease and the retain. - FindDependencies(CanChangeRetainCount, Arg, - BB, Autorelease, DependingInstructions, Visited, PA); - if (DependingInstructions.size() != 1) - goto next_block; - - { - CallInst *Retain = - dyn_cast_or_null<CallInst>(*DependingInstructions.begin()); - - // Check that we found a retain with the same argument. - if (!Retain || - !IsRetain(GetBasicInstructionClass(Retain)) || - GetObjCArg(Retain) != Arg) - goto next_block; - - DependingInstructions.clear(); - Visited.clear(); - - // Convert the autorelease to an autoreleaseRV, since it's - // returning the value. - if (AutoreleaseClass == IC_Autorelease) { - Autorelease->setCalledFunction(getAutoreleaseRVCallee(F.getParent())); - AutoreleaseClass = IC_AutoreleaseRV; - } - - // Check that there is nothing that can affect the reference - // count between the retain and the call. - // Note that Retain need not be in BB. - FindDependencies(CanChangeRetainCount, Arg, Retain->getParent(), Retain, - DependingInstructions, Visited, PA); - if (DependingInstructions.size() != 1) - goto next_block; - - { - CallInst *Call = - dyn_cast_or_null<CallInst>(*DependingInstructions.begin()); - - // Check that the pointer is the return value of the call. - if (!Call || Arg != Call) - goto next_block; - - // Check that the call is a regular call. - InstructionClass Class = GetBasicInstructionClass(Call); - if (Class != IC_CallOrUser && Class != IC_Call) - goto next_block; - - // If so, we can zap the retain and autorelease. - Changed = true; - ++NumRets; - DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Erasing: " << *Retain - << "\n Erasing: " - << *Autorelease << "\n"); - EraseInstruction(Retain); - EraseInstruction(Autorelease); - } - } - } - - next_block: - DependingInstructions.clear(); - Visited.clear(); - } - - DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Finished List.\n\n"); - -} - -bool ObjCARCOpt::doInitialization(Module &M) { - if (!EnableARCOpts) - return false; - - // If nothing in the Module uses ARC, don't do anything. - Run = ModuleHasARC(M); - if (!Run) - return false; - - // Identify the imprecise release metadata kind. 
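The initialization that follows caches metadata kind IDs once per module, so later per-instruction queries are integer comparisons rather than string lookups. A minimal sketch of the pattern, with an illustrative helper name and the string-keyed API of this period:

  #include "llvm/IR/Instruction.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Module.h"
  using namespace llvm;

  // Illustrative only: test whether a call carries the imprecise-release
  // marker that clang attaches under ARC.
  static bool hasImpreciseRelease(Module &M, Instruction &Call) {
    unsigned KindID = M.getContext().getMDKindID("clang.imprecise_release");
    return Call.getMetadata(KindID) != 0; // Cheap integer-keyed lookup.
  }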
- ImpreciseReleaseMDKind =
- M.getContext().getMDKindID("clang.imprecise_release");
- CopyOnEscapeMDKind =
- M.getContext().getMDKindID("clang.arc.copy_on_escape");
- NoObjCARCExceptionsMDKind =
- M.getContext().getMDKindID("clang.arc.no_objc_arc_exceptions");
-
- // Intuitively, objc_retain and others are nocapture, however in practice
- // they are not, because they return their argument value. And objc_release
- // calls finalizers which can have arbitrary side effects.
-
- // These are initialized lazily.
- RetainRVCallee = 0;
- AutoreleaseRVCallee = 0;
- ReleaseCallee = 0;
- RetainCallee = 0;
- RetainBlockCallee = 0;
- AutoreleaseCallee = 0;
-
- return false;
-}
-
-bool ObjCARCOpt::runOnFunction(Function &F) {
- if (!EnableARCOpts)
- return false;
-
- // If nothing in the Module uses ARC, don't do anything.
- if (!Run)
- return false;
-
- Changed = false;
-
- PA.setAA(&getAnalysis<AliasAnalysis>());
-
- // This pass performs several distinct transformations. As a compile-time aid
- // when compiling code that isn't ObjC, skip these if the relevant ObjC
- // library functions aren't declared.
-
- // Preliminary optimizations. This also computes UsedInThisFunction.
- OptimizeIndividualCalls(F);
-
- // Optimizations for weak pointers.
- if (UsedInThisFunction & ((1 << IC_LoadWeak) |
- (1 << IC_LoadWeakRetained) |
- (1 << IC_StoreWeak) |
- (1 << IC_InitWeak) |
- (1 << IC_CopyWeak) |
- (1 << IC_MoveWeak) |
- (1 << IC_DestroyWeak)))
- OptimizeWeakCalls(F);
-
- // Optimizations for retain+release pairs.
- if (UsedInThisFunction & ((1 << IC_Retain) |
- (1 << IC_RetainRV) |
- (1 << IC_RetainBlock)))
- if (UsedInThisFunction & (1 << IC_Release))
- // Run OptimizeSequences until it either stops making changes or
- // no retain+release pair nesting is detected.
- while (OptimizeSequences(F)) {}
-
- // Optimizations if objc_autorelease is used.
- if (UsedInThisFunction & ((1 << IC_Autorelease) |
- (1 << IC_AutoreleaseRV)))
- OptimizeReturns(F);
-
- return Changed;
-}
-
-void ObjCARCOpt::releaseMemory() {
- PA.clear();
-}
-
-//===----------------------------------------------------------------------===//
-// ARC contraction.
-//===----------------------------------------------------------------------===//
-
-// TODO: ObjCARCContract could insert PHI nodes when uses aren't
-// dominated by single calls.
-
-#include "llvm/Analysis/Dominators.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/Operator.h"
-
-STATISTIC(NumStoreStrongs, "Number of objc_storeStrong calls formed");
-
-namespace {
- /// ObjCARCContract - Late ARC optimizations. These change the IR in a way
- /// that makes it difficult to be analyzed by ObjCARCOpt, so it's run late.
- class ObjCARCContract : public FunctionPass {
- bool Changed;
- AliasAnalysis *AA;
- DominatorTree *DT;
- ProvenanceAnalysis PA;
-
- /// Run - A flag indicating whether this optimization pass should run.
- bool Run;
-
- /// StoreStrongCallee, etc. - Declarations for ObjC runtime
- /// functions, for use in creating calls to them. These are initialized
- /// lazily to avoid cluttering up the Module with unused declarations.
- Constant *StoreStrongCallee,
- *RetainAutoreleaseCallee, *RetainAutoreleaseRVCallee;
-
- /// RetainRVMarker - The inline asm string to insert between calls and
- /// RetainRV calls to make the optimization work on targets which need it.
- const MDString *RetainRVMarker;
-
- /// StoreStrongCalls - The set of inserted objc_storeStrong calls.
If - /// at the end of walking the function we have found no alloca - /// instructions, these calls can be marked "tail". - SmallPtrSet<CallInst *, 8> StoreStrongCalls; - - Constant *getStoreStrongCallee(Module *M); - Constant *getRetainAutoreleaseCallee(Module *M); - Constant *getRetainAutoreleaseRVCallee(Module *M); - - bool ContractAutorelease(Function &F, Instruction *Autorelease, - InstructionClass Class, - SmallPtrSet<Instruction *, 4> - &DependingInstructions, - SmallPtrSet<const BasicBlock *, 4> - &Visited); - - void ContractRelease(Instruction *Release, - inst_iterator &Iter); - - virtual void getAnalysisUsage(AnalysisUsage &AU) const; - virtual bool doInitialization(Module &M); - virtual bool runOnFunction(Function &F); - - public: - static char ID; - ObjCARCContract() : FunctionPass(ID) { - initializeObjCARCContractPass(*PassRegistry::getPassRegistry()); - } - }; -} - -char ObjCARCContract::ID = 0; -INITIALIZE_PASS_BEGIN(ObjCARCContract, - "objc-arc-contract", "ObjC ARC contraction", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) -INITIALIZE_PASS_END(ObjCARCContract, - "objc-arc-contract", "ObjC ARC contraction", false, false) - -Pass *llvm::createObjCARCContractPass() { - return new ObjCARCContract(); -} - -void ObjCARCContract::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AliasAnalysis>(); - AU.addRequired<DominatorTree>(); - AU.setPreservesCFG(); -} - -Constant *ObjCARCContract::getStoreStrongCallee(Module *M) { - if (!StoreStrongCallee) { - LLVMContext &C = M->getContext(); - Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - Type *I8XX = PointerType::getUnqual(I8X); - Type *Params[] = { I8XX, I8X }; - - AttributeSet Attribute = AttributeSet() - .addAttr(M->getContext(), AttributeSet::FunctionIndex, - Attribute::get(C, Attribute::NoUnwind)) - .addAttr(M->getContext(), 1, Attribute::get(C, Attribute::NoCapture)); - - StoreStrongCallee = - M->getOrInsertFunction( - "objc_storeStrong", - FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false), - Attribute); - } - return StoreStrongCallee; -} - -Constant *ObjCARCContract::getRetainAutoreleaseCallee(Module *M) { - if (!RetainAutoreleaseCallee) { - LLVMContext &C = M->getContext(); - Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - Type *Params[] = { I8X }; - FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttributeSet Attribute = - AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex, - Attribute::get(C, Attribute::NoUnwind)); - RetainAutoreleaseCallee = - M->getOrInsertFunction("objc_retainAutorelease", FTy, Attribute); - } - return RetainAutoreleaseCallee; -} - -Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) { - if (!RetainAutoreleaseRVCallee) { - LLVMContext &C = M->getContext(); - Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - Type *Params[] = { I8X }; - FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttributeSet Attribute = - AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex, - Attribute::get(C, Attribute::NoUnwind)); - RetainAutoreleaseRVCallee = - M->getOrInsertFunction("objc_retainAutoreleaseReturnValue", FTy, - Attribute); - } - return RetainAutoreleaseRVCallee; -} - -/// ContractAutorelease - Merge an autorelease with a retain into a fused call. 
-bool -ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease, - InstructionClass Class, - SmallPtrSet<Instruction *, 4> - &DependingInstructions, - SmallPtrSet<const BasicBlock *, 4> - &Visited) { - const Value *Arg = GetObjCArg(Autorelease); - - // Check that there are no instructions between the retain and the autorelease - // (such as an autorelease_pop) which may change the count. - CallInst *Retain = 0; - if (Class == IC_AutoreleaseRV) - FindDependencies(RetainAutoreleaseRVDep, Arg, - Autorelease->getParent(), Autorelease, - DependingInstructions, Visited, PA); - else - FindDependencies(RetainAutoreleaseDep, Arg, - Autorelease->getParent(), Autorelease, - DependingInstructions, Visited, PA); - - Visited.clear(); - if (DependingInstructions.size() != 1) { - DependingInstructions.clear(); - return false; - } - - Retain = dyn_cast_or_null<CallInst>(*DependingInstructions.begin()); - DependingInstructions.clear(); - - if (!Retain || - GetBasicInstructionClass(Retain) != IC_Retain || - GetObjCArg(Retain) != Arg) - return false; - - Changed = true; - ++NumPeeps; - - DEBUG(dbgs() << "ObjCARCContract::ContractAutorelease: Fusing " - "retain/autorelease. Erasing: " << *Autorelease << "\n" - " Old Retain: " - << *Retain << "\n"); - - if (Class == IC_AutoreleaseRV) - Retain->setCalledFunction(getRetainAutoreleaseRVCallee(F.getParent())); - else - Retain->setCalledFunction(getRetainAutoreleaseCallee(F.getParent())); - - DEBUG(dbgs() << " New Retain: " - << *Retain << "\n"); - - EraseInstruction(Autorelease); - return true; -} - -/// ContractRelease - Attempt to merge an objc_release with a store, load, and -/// objc_retain to form an objc_storeStrong. This can be a little tricky because -/// the instructions don't always appear in order, and there may be unrelated -/// intervening instructions. -void ObjCARCContract::ContractRelease(Instruction *Release, - inst_iterator &Iter) { - LoadInst *Load = dyn_cast<LoadInst>(GetObjCArg(Release)); - if (!Load || !Load->isSimple()) return; - - // For now, require everything to be in one basic block. - BasicBlock *BB = Release->getParent(); - if (Load->getParent() != BB) return; - - // Walk down to find the store and the release, which may be in either order. - BasicBlock::iterator I = Load, End = BB->end(); - ++I; - AliasAnalysis::Location Loc = AA->getLocation(Load); - StoreInst *Store = 0; - bool SawRelease = false; - for (; !Store || !SawRelease; ++I) { - if (I == End) - return; - - Instruction *Inst = I; - if (Inst == Release) { - SawRelease = true; - continue; - } - - InstructionClass Class = GetBasicInstructionClass(Inst); - - // Unrelated retains are harmless. - if (IsRetain(Class)) - continue; - - if (Store) { - // The store is the point where we're going to put the objc_storeStrong, - // so make sure there are no uses after it. - if (CanUse(Inst, Load, PA, Class)) - return; - } else if (AA->getModRefInfo(Inst, Loc) & AliasAnalysis::Mod) { - // We are moving the load down to the store, so check for anything - // else which writes to the memory between the load and the store. - Store = dyn_cast<StoreInst>(Inst); - if (!Store || !Store->isSimple()) return; - if (Store->getPointerOperand() != Loc.Ptr) return; - } - } - - Value *New = StripPointerCastsAndObjCCalls(Store->getValueOperand()); - - // Walk up to find the retain. 
- I = Store; - BasicBlock::iterator Begin = BB->begin(); - while (I != Begin && GetBasicInstructionClass(I) != IC_Retain) - --I; - Instruction *Retain = I; - if (GetBasicInstructionClass(Retain) != IC_Retain) return; - if (GetObjCArg(Retain) != New) return; - - Changed = true; - ++NumStoreStrongs; - - LLVMContext &C = Release->getContext(); - Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - Type *I8XX = PointerType::getUnqual(I8X); - - Value *Args[] = { Load->getPointerOperand(), New }; - if (Args[0]->getType() != I8XX) - Args[0] = new BitCastInst(Args[0], I8XX, "", Store); - if (Args[1]->getType() != I8X) - Args[1] = new BitCastInst(Args[1], I8X, "", Store); - CallInst *StoreStrong = - CallInst::Create(getStoreStrongCallee(BB->getParent()->getParent()), - Args, "", Store); - StoreStrong->setDoesNotThrow(); - StoreStrong->setDebugLoc(Store->getDebugLoc()); - - // We can't set the tail flag yet, because we haven't yet determined - // whether there are any escaping allocas. Remember this call, so that - // we can set the tail flag once we know it's safe. - StoreStrongCalls.insert(StoreStrong); - - if (&*Iter == Store) ++Iter; - Store->eraseFromParent(); - Release->eraseFromParent(); - EraseInstruction(Retain); - if (Load->use_empty()) - Load->eraseFromParent(); -} - -bool ObjCARCContract::doInitialization(Module &M) { - // If nothing in the Module uses ARC, don't do anything. - Run = ModuleHasARC(M); - if (!Run) - return false; - - // These are initialized lazily. - StoreStrongCallee = 0; - RetainAutoreleaseCallee = 0; - RetainAutoreleaseRVCallee = 0; - - // Initialize RetainRVMarker. - RetainRVMarker = 0; - if (NamedMDNode *NMD = - M.getNamedMetadata("clang.arc.retainAutoreleasedReturnValueMarker")) - if (NMD->getNumOperands() == 1) { - const MDNode *N = NMD->getOperand(0); - if (N->getNumOperands() == 1) - if (const MDString *S = dyn_cast<MDString>(N->getOperand(0))) - RetainRVMarker = S; - } - - return false; -} - -bool ObjCARCContract::runOnFunction(Function &F) { - if (!EnableARCOpts) - return false; - - // If nothing in the Module uses ARC, don't do anything. - if (!Run) - return false; - - Changed = false; - AA = &getAnalysis<AliasAnalysis>(); - DT = &getAnalysis<DominatorTree>(); - - PA.setAA(&getAnalysis<AliasAnalysis>()); - - // Track whether it's ok to mark objc_storeStrong calls with the "tail" - // keyword. Be conservative if the function has variadic arguments. - // It seems that functions which "return twice" are also unsafe for the - // "tail" argument, because they are setjmp, which could need to - // return to an earlier stack state. - bool TailOkForStoreStrongs = !F.isVarArg() && - !F.callsFunctionThatReturnsTwice(); - - // For ObjC library calls which return their argument, replace uses of the - // argument with uses of the call return value, if it dominates the use. This - // reduces register pressure. - SmallPtrSet<Instruction *, 4> DependingInstructions; - SmallPtrSet<const BasicBlock *, 4> Visited; - for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { - Instruction *Inst = &*I++; - - DEBUG(dbgs() << "ObjCARCContract: Visiting: " << *Inst << "\n"); - - // Only these library routines return their argument. In particular, - // objc_retainBlock does not necessarily return its argument. 
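ContractRelease, completed just above, is a peephole over four instructions: the load of the old value, the retain of the new value, the store, and the release of the old value all collapse into one runtime call. A sketch of the emission step, assuming StoreStrongFn is the objc_storeStrong declaration with signature void(i8**, i8*):

  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  // Illustrative only. Conceptually replaces (bitcasts elided):
  //   %old = load i8** %addr
  //   %new1 = call i8* @objc_retain(i8* %new)
  //   store i8* %new1, i8** %addr
  //   call void @objc_release(i8* %old)
  // with a single call to the fused runtime entry point.
  static CallInst *emitStoreStrong(IRBuilder<> &B, Value *StoreStrongFn,
                                   Value *Addr, Value *New) {
    Value *Args[] = { Addr, New };
    CallInst *CI = B.CreateCall(StoreStrongFn, Args);
    CI->setDoesNotThrow(); // objc_storeStrong is nounwind.
    return CI;
  }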
- InstructionClass Class = GetBasicInstructionClass(Inst); - switch (Class) { - case IC_Retain: - case IC_FusedRetainAutorelease: - case IC_FusedRetainAutoreleaseRV: - break; - case IC_Autorelease: - case IC_AutoreleaseRV: - if (ContractAutorelease(F, Inst, Class, DependingInstructions, Visited)) - continue; - break; - case IC_RetainRV: { - // If we're compiling for a target which needs a special inline-asm - // marker to do the retainAutoreleasedReturnValue optimization, - // insert it now. - if (!RetainRVMarker) - break; - BasicBlock::iterator BBI = Inst; - BasicBlock *InstParent = Inst->getParent(); - - // Step up to see if the call immediately precedes the RetainRV call. - // If it's an invoke, we have to cross a block boundary. And we have - // to carefully dodge no-op instructions. - do { - if (&*BBI == InstParent->begin()) { - BasicBlock *Pred = InstParent->getSinglePredecessor(); - if (!Pred) - goto decline_rv_optimization; - BBI = Pred->getTerminator(); - break; - } - --BBI; - } while (isNoopInstruction(BBI)); - - if (&*BBI == GetObjCArg(Inst)) { - DEBUG(dbgs() << "ObjCARCContract: Adding inline asm marker for " - "retainAutoreleasedReturnValue optimization.\n"); - Changed = true; - InlineAsm *IA = - InlineAsm::get(FunctionType::get(Type::getVoidTy(Inst->getContext()), - /*isVarArg=*/false), - RetainRVMarker->getString(), - /*Constraints=*/"", /*hasSideEffects=*/true); - CallInst::Create(IA, "", Inst); - } - decline_rv_optimization: - break; - } - case IC_InitWeak: { - // objc_initWeak(p, null) => *p = null - CallInst *CI = cast<CallInst>(Inst); - if (isNullOrUndef(CI->getArgOperand(1))) { - Value *Null = - ConstantPointerNull::get(cast<PointerType>(CI->getType())); - Changed = true; - new StoreInst(Null, CI->getArgOperand(0), CI); - - DEBUG(dbgs() << "OBJCARCContract: Old = " << *CI << "\n" - << " New = " << *Null << "\n"); - - CI->replaceAllUsesWith(Null); - CI->eraseFromParent(); - } - continue; - } - case IC_Release: - ContractRelease(Inst, I); - continue; - case IC_User: - // Be conservative if the function has any alloca instructions. - // Technically we only care about escaping alloca instructions, - // but this is sufficient to handle some interesting cases. - if (isa<AllocaInst>(Inst)) - TailOkForStoreStrongs = false; - continue; - default: - continue; - } - - DEBUG(dbgs() << "ObjCARCContract: Finished List.\n\n"); - - // Don't use GetObjCArg because we don't want to look through bitcasts - // and such; to do the replacement, the argument must have type i8*. - const Value *Arg = cast<CallInst>(Inst)->getArgOperand(0); - for (;;) { - // If we're compiling bugpointed code, don't get in trouble. - if (!isa<Instruction>(Arg) && !isa<Argument>(Arg)) - break; - // Look through the uses of the pointer. - for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end(); - UI != UE; ) { - Use &U = UI.getUse(); - unsigned OperandNo = UI.getOperandNo(); - ++UI; // Increment UI now, because we may unlink its element. - - // If the call's return value dominates a use of the call's argument - // value, rewrite the use to use the return value. We check for - // reachability here because an unreachable call is considered to - // trivially dominate itself, which would lead us to rewriting its - // argument in terms of its return value, which would lead to - // infinite loops in GetObjCArg. 
- if (DT->isReachableFromEntry(U) && DT->dominates(Inst, U)) { - Changed = true; - Instruction *Replacement = Inst; - Type *UseTy = U.get()->getType(); - if (PHINode *PHI = dyn_cast<PHINode>(U.getUser())) { - // For PHI nodes, insert the bitcast in the predecessor block. - unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo); - BasicBlock *BB = PHI->getIncomingBlock(ValNo); - if (Replacement->getType() != UseTy) - Replacement = new BitCastInst(Replacement, UseTy, "", - &BB->back()); - // While we're here, rewrite all edges for this PHI, rather - // than just one use at a time, to minimize the number of - // bitcasts we emit. - for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) - if (PHI->getIncomingBlock(i) == BB) { - // Keep the UI iterator valid. - if (&PHI->getOperandUse( - PHINode::getOperandNumForIncomingValue(i)) == - &UI.getUse()) - ++UI; - PHI->setIncomingValue(i, Replacement); - } - } else { - if (Replacement->getType() != UseTy) - Replacement = new BitCastInst(Replacement, UseTy, "", - cast<Instruction>(U.getUser())); - U.set(Replacement); - } - } - } - - // If Arg is a no-op casted pointer, strip one level of casts and iterate. - if (const BitCastInst *BI = dyn_cast<BitCastInst>(Arg)) - Arg = BI->getOperand(0); - else if (isa<GEPOperator>(Arg) && - cast<GEPOperator>(Arg)->hasAllZeroIndices()) - Arg = cast<GEPOperator>(Arg)->getPointerOperand(); - else if (isa<GlobalAlias>(Arg) && - !cast<GlobalAlias>(Arg)->mayBeOverridden()) - Arg = cast<GlobalAlias>(Arg)->getAliasee(); - else - break; - } - } - - // If this function has no escaping allocas or suspicious vararg usage, - // objc_storeStrong calls can be marked with the "tail" keyword. - if (TailOkForStoreStrongs) - for (SmallPtrSet<CallInst *, 8>::iterator I = StoreStrongCalls.begin(), - E = StoreStrongCalls.end(); I != E; ++I) - (*I)->setTailCall(); - StoreStrongCalls.clear(); - - return Changed; -} diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 3e935d8..e30a274 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -271,13 +271,6 @@ public: return I->second; } - /*LatticeVal getStructLatticeValueFor(Value *V, unsigned i) const { - DenseMap<std::pair<Value*, unsigned>, LatticeVal>::const_iterator I = - StructValueState.find(std::make_pair(V, i)); - assert(I != StructValueState.end() && "V is not in valuemap!"); - return I->second; - }*/ - /// getTrackedRetVals - Get the inferred return value map. /// const DenseMap<Function*, LatticeVal> &getTrackedRetVals() { @@ -710,9 +703,6 @@ void SCCPSolver::visitPHINode(PHINode &PN) { markConstant(&PN, OperandVal); // Acquire operand value } - - - void SCCPSolver::visitReturnInst(ReturnInst &I) { if (I.getNumOperands() == 0) return; // ret void @@ -1185,7 +1175,7 @@ void SCCPSolver::Solve() { DEBUG(dbgs() << "\nPopped off OI-WL: " << *I << '\n'); // "I" got into the work list because it either made the transition from - // bottom to constant + // bottom to constant, or to overdefined. 
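The corrected comment above reflects SCCP's lattice discipline: a value starts at bottom (undefined), may rise to a single known constant, and tops out at overdefined, never moving back down; that monotonicity is what bounds how often anything re-enters the worklists. A minimal sketch of the merge rule, with illustrative types rather than SCCP's actual LatticeVal:

  enum LatticeKind { Bottom, Constant, Overdefined };

  struct Lattice {
    LatticeKind Kind;
    int Val; // Meaningful only when Kind == Constant.
  };

  // Merge two facts about the same value; the result only ever climbs.
  static Lattice join(Lattice A, Lattice B) {
    if (A.Kind == Bottom) return B;
    if (B.Kind == Bottom) return A;
    if (A.Kind == Constant && B.Kind == Constant && A.Val == B.Val)
      return A;                       // Agreeing constants stay constant.
    Lattice Top = { Overdefined, 0 }; // Any disagreement is overdefined.
    return Top;
  }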
// // Anything on this worklist that is overdefined need not be visited // since all of its users will have already been marked as overdefined diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 4204171..e90fe90 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -43,14 +43,12 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/InstVisitor.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" @@ -411,9 +409,9 @@ static Value *foldSelectInst(SelectInst &SI) { // early on. if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition())) return SI.getOperand(1+CI->isZero()); - if (SI.getOperand(1) == SI.getOperand(2)) { + if (SI.getOperand(1) == SI.getOperand(2)) return SI.getOperand(1); - } + return 0; } @@ -621,7 +619,7 @@ private: } // Disable SRoA for any intrinsics except for lifetime invariants. - // FIXME: What about debug instrinsics? This matches old behavior, but + // FIXME: What about debug intrinsics? This matches old behavior, but // doesn't make sense. void visitIntrinsicInst(IntrinsicInst &II) { if (!IsOffsetKnown) @@ -1141,8 +1139,7 @@ void AllocaPartitioning::print(raw_ostream &OS, const_iterator I, void AllocaPartitioning::printUsers(raw_ostream &OS, const_iterator I, StringRef Indent) const { - for (const_use_iterator UI = use_begin(I), UE = use_end(I); - UI != UE; ++UI) { + for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) { if (!UI->U) continue; // Skip dead uses. OS << Indent << " [" << UI->BeginOffset << "," << UI->EndOffset << ") " @@ -1170,8 +1167,7 @@ void AllocaPartitioning::print(raw_ostream &OS) const { } OS << "Partitioning of alloca: " << AI << "\n"; - unsigned Num = 0; - for (const_iterator I = begin(), E = end(); I != E; ++I, ++Num) { + for (const_iterator I = begin(), E = end(); I != E; ++I) { print(OS, I); printUsers(OS, I); } @@ -1242,7 +1238,7 @@ public: for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; - Value *Arg = NULL; + Value *Arg = 0; if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { // If an argument is zero extended then use argument directly. The ZExt // may be zapped by an optimization pass in future. @@ -1277,7 +1273,7 @@ namespace { /// 1) It takes allocations of aggregates and analyzes the ways in which they /// are used to try to split them into smaller allocations, ideally of /// a single scalar data type. It will split up memcpy and memset accesses -/// as necessary and try to isolate invidual scalar accesses. +/// as necessary and try to isolate individual scalar accesses. /// 2) It will transform accesses into forms which are suitable for SSA value /// promotion. This can be replacing a memset with a scalar store of an /// integer value, or it can involve speculating operations on a PHI or @@ -1439,8 +1435,7 @@ private: // We can only transform this if it is safe to push the loads into the // predecessor blocks. The only thing to watch out for is that we can't put // a possibly trapping load in the predecessor if it is a critical edge. 
- for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; - ++Idx) { + for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator(); Value *InVal = PN.getIncomingValue(Idx); @@ -1483,7 +1478,7 @@ private: PN.getName() + ".sroa.speculated"); // Get the TBAA tag and alignment to use from one of the loads. It doesn't - // matter which one we get and if any differ, it doesn't matter. + // matter which one we get and if any differ. LoadInst *SomeLoad = cast<LoadInst>(Loads.back()); MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); unsigned Align = SomeLoad->getAlignment(); @@ -1816,7 +1811,7 @@ static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const DataLayout &TD, /// The strategy for finding the more natural GEPs is to peel off layers of the /// pointer, walking back through bit casts and GEPs, searching for a base /// pointer from which we can compute a natural GEP with the desired -/// properities. The algorithm tries to fold as many constant indices into +/// properties. The algorithm tries to fold as many constant indices into /// a single GEP as possible, thus making each GEP more independent of the /// surrounding code. static Value *getAdjustedPtr(IRBuilder<> &IRB, const DataLayout &TD, @@ -2062,9 +2057,9 @@ static bool isIntegerWideningViable(const DataLayout &TD, uint64_t Size = TD.getTypeStoreSize(AllocaTy); - // Check the uses to ensure the uses are (likely) promoteable integer uses. + // Check the uses to ensure the uses are (likely) promotable integer uses. // Also ensure that the alloca has a covering load or store. We don't want - // to widen the integer operotains only to fail to promote due to some other + // to widen the integer operations only to fail to promote due to some other // unsplittable entry (which we may make splittable later). bool WholeAllocaOp = false; for (; I != E; ++I) { @@ -2283,7 +2278,7 @@ class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter, // If we are rewriting an alloca partition which can be written as pure // vector operations, we stash extra information here. When VecTy is - // non-null, we have some strict guarantees about the rewriten alloca: + // non-null, we have some strict guarantees about the rewritten alloca: // - The new alloca is exactly the size of the vector type here. // - The accesses all either map to the entire vector or to a single // element. @@ -2636,7 +2631,7 @@ private: /// /// Note that this routine assumes an i8 is a byte. If that isn't true, don't /// call this routine. - /// FIXME: Heed the abvice above. + /// FIXME: Heed the advice above. /// /// \param V The i8 value to splat. /// \param Size The number of bytes in the output (assuming i8 is one byte) @@ -2971,6 +2966,7 @@ private: else New = IRB.CreateLifetimeEnd(Ptr, Size); + (void)New; DEBUG(dbgs() << " to: " << *New << "\n"); return true; } @@ -3147,9 +3143,8 @@ private: void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { assert(Ty->isSingleValueType()); // Load the single value and insert it using the indices. 
- Value *Load = IRB.CreateLoad(IRB.CreateInBoundsGEP(Ptr, GEPIndices, - Name + ".gep"), - Name + ".load"); + Value *GEP = IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep"); + Value *Load = IRB.CreateLoad(GEP, Name + ".load"); Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); DEBUG(dbgs() << " to: " << *Load << "\n"); } @@ -3422,7 +3417,7 @@ bool SROA::rewriteAllocaPartition(AllocaInst &AI, // Check for the case where we're going to rewrite to a new alloca of the // exact same type as the original, and with the same access offsets. In that // case, re-use the existing alloca, but still run through the rewriter to - // performe phi and select speculation. + // perform phi and select speculation. AllocaInst *NewAI; if (AllocaTy == AI.getAllocatedType()) { assert(PI->BeginOffset == 0 && @@ -3589,7 +3584,7 @@ void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) { /// If there is a domtree available, we attempt to promote using the full power /// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is /// based on the SSAUpdater utilities. This function returns whether any -/// promotion occured. +/// promotion occurred. bool SROA::promoteAllocas(Function &F) { if (PromotableAllocas.empty()) return false; diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 35d2fa0..8a9c7da 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -50,11 +50,6 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLowerAtomicPass(Registry); initializeLowerExpectIntrinsicPass(Registry); initializeMemCpyOptPass(Registry); - initializeObjCARCAliasAnalysisPass(Registry); - initializeObjCARCAPElimPass(Registry); - initializeObjCARCExpandPass(Registry); - initializeObjCARCContractPass(Registry); - initializeObjCARCOptPass(Registry); initializeReassociatePass(Registry); initializeRegToMemPass(Registry); initializeSCCPPass(Registry); diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index d5cefa3..916b37d 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -165,7 +165,7 @@ bool SimplifyLibCalls::runOnFunction(Function &F) { for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { // Ignore non-calls. CallInst *CI = dyn_cast<CallInst>(I++); - if (!CI) continue; + if (!CI || CI->hasFnAttr(Attribute::NoBuiltin)) continue; // Ignore indirect calls and calls to non-external functions. 
Function *Callee = CI->getCalledFunction(); diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index 6572e09..2002e68 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -58,6 +58,7 @@ #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" @@ -79,11 +80,15 @@ STATISTIC(NumAccumAdded, "Number of accumulators introduced"); namespace { struct TailCallElim : public FunctionPass { + const TargetTransformInfo *TTI; + static char ID; // Pass identification, replacement for typeid TailCallElim() : FunctionPass(ID) { initializeTailCallElimPass(*PassRegistry::getPassRegistry()); } + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual bool runOnFunction(Function &F); private: @@ -109,14 +114,21 @@ namespace { } char TailCallElim::ID = 0; -INITIALIZE_PASS(TailCallElim, "tailcallelim", - "Tail Call Elimination", false, false) +INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", + "Tail Call Elimination", false, false) +INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_END(TailCallElim, "tailcallelim", + "Tail Call Elimination", false, false) // Public interface to the TailCallElimination pass FunctionPass *llvm::createTailCallEliminationPass() { return new TailCallElim(); } +void TailCallElim::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetTransformInfo>(); +} + /// AllocaMightEscapeToCalls - Return true if this alloca may be accessed by /// callees of this function. We only do very simple analysis right now, this /// could be expanded in the future to use mod/ref information for particular @@ -151,6 +163,7 @@ bool TailCallElim::runOnFunction(Function &F) { // right, so don't even try to convert it... if (F.getFunctionType()->isVarArg()) return false; + TTI = &getAnalysis<TargetTransformInfo>(); BasicBlock *OldEntry = 0; bool TailCallsAreMarkedTail = false; SmallVector<PHINode*, 8> ArgumentPHIs; @@ -391,7 +404,8 @@ TailCallElim::FindTRECandidate(Instruction *TI, if (BB == &F->getEntryBlock() && FirstNonDbg(BB->front()) == CI && FirstNonDbg(llvm::next(BB->begin())) == TI && - callIsSmall(CI)) { + CI->getCalledFunction() && + !TTI->isLoweredToCall(CI->getCalledFunction())) { // A single-block function with just a call and a return. Check that // the arguments match. CallSite::arg_iterator I = CallSite(CI).arg_begin(), diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp index 8330e84..ba99d2e 100644 --- a/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -37,12 +37,12 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) { // Can delete self loop. BB->getSinglePredecessor() == BB) && "Block is not dead!"); TerminatorInst *BBTerm = BB->getTerminator(); - + // Loop through all of our successors and make sure they know that one // of their predecessors is going away. for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) BBTerm->getSuccessor(i)->removePredecessor(BB); - + // Zap all the instructions in the block. while (!BB->empty()) { Instruction &I = BB->back(); @@ -55,7 +55,7 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) { I.replaceAllUsesWith(UndefValue::get(I.getType())); BB->getInstList().pop_back(); } - + // Zap the block! 
BB->eraseFromParent(); } @@ -66,25 +66,25 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) { /// when the block has exactly one predecessor. void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, Pass *P) { if (!isa<PHINode>(BB->begin())) return; - + AliasAnalysis *AA = 0; MemoryDependenceAnalysis *MemDep = 0; if (P) { AA = P->getAnalysisIfAvailable<AliasAnalysis>(); MemDep = P->getAnalysisIfAvailable<MemoryDependenceAnalysis>(); } - + while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) { if (PN->getIncomingValue(0) != PN) PN->replaceAllUsesWith(PN->getIncomingValue(0)); else PN->replaceAllUsesWith(UndefValue::get(PN->getType())); - + if (MemDep) MemDep->removeInstruction(PN); // Memdep updates AA itself. else if (AA && isa<PointerType>(PN->getType())) AA->deleteValue(PN); - + PN->eraseFromParent(); } } @@ -115,7 +115,7 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) { bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) { // Don't merge away blocks who have their address taken. if (BB->hasAddressTaken()) return false; - + // Can't merge if there are multiple predecessors, or no predecessors. BasicBlock *PredBB = BB->getUniquePredecessor(); if (!PredBB) return false; @@ -124,7 +124,7 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) { if (PredBB == BB) return false; // Don't break invokes. if (isa<InvokeInst>(PredBB->getTerminator())) return false; - + succ_iterator SI(succ_begin(PredBB)), SE(succ_end(PredBB)); BasicBlock *OnlySucc = BB; for (; SI != SE; ++SI) @@ -132,7 +132,7 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) { OnlySucc = 0; // There are multiple distinct successors! break; } - + // Can't merge if there are multiple successors. if (!OnlySucc) return false; @@ -149,21 +149,21 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) { // Begin by getting rid of unneeded PHIs. if (isa<PHINode>(BB->front())) FoldSingleEntryPHINodes(BB, P); - + // Delete the unconditional branch from the predecessor... PredBB->getInstList().pop_back(); - + // Make all PHI nodes that referred to BB now refer to Pred as their // source... BB->replaceAllUsesWith(PredBB); - + // Move all definitions in the successor to the predecessor... PredBB->getInstList().splice(PredBB->end(), BB->getInstList()); - + // Inherit predecessors name if it exists. if (!PredBB->hasName()) PredBB->takeName(BB); - + // Finally, erase the old block and update dominator info. if (P) { if (DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>()) { @@ -176,16 +176,16 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) { DT->eraseNode(BB); } - + if (LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>()) LI->removeBlock(BB); - + if (MemoryDependenceAnalysis *MD = P->getAnalysisIfAvailable<MemoryDependenceAnalysis>()) MD->invalidateCachedPredecessors(); } } - + BB->eraseFromParent(); return true; } @@ -251,11 +251,11 @@ unsigned llvm::GetSuccessorNumber(BasicBlock *BB, BasicBlock *Succ) { } } -/// SplitEdge - Split the edge connecting specified block. Pass P must -/// not be NULL. +/// SplitEdge - Split the edge connecting specified block. Pass P must +/// not be NULL. BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) { unsigned SuccNum = GetSuccessorNumber(BB, Succ); - + // If this is a critical edge, let SplitCriticalEdge do it. 
TerminatorInst *LatchTerm = BB->getTerminator();
 if (SplitCriticalEdge(LatchTerm, SuccNum, P))
@@ -271,11 +271,11 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) {
 SP = NULL;
 return SplitBlock(Succ, Succ->begin(), P);
 }
-
+
 // Otherwise, if BB has a single successor, split it at the bottom of the
 // block.
 assert(BB->getTerminator()->getNumSuccessors() == 1 &&
- "Should have a single succ!");
+ "Should have a single succ!");
 return SplitBlock(BB, BB->getTerminator(), P);
}
@@ -301,12 +301,12 @@ BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, Pass *P) {
 if (DomTreeNode *OldNode = DT->getNode(Old)) {
 std::vector<DomTreeNode *> Children;
 for (DomTreeNode::iterator I = OldNode->begin(), E = OldNode->end();
- I != E; ++I)
+ I != E; ++I)
 Children.push_back(*I);
 DomTreeNode *NewNode = DT->addNewBlock(New,Old);
 for (std::vector<DomTreeNode *>::iterator I = Children.begin(),
- E = Children.end(); I != E; ++I)
+ E = Children.end(); I != E; ++I)
 DT->changeImmediateDominator(*I, NewNode);
 }
 }
@@ -424,7 +424,7 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
 PHINode *NewPHI = PHINode::Create(PN->getType(), Preds.size(),
 PN->getName() + ".ph", BI);
 if (AA) AA->copyValue(PN, NewPHI);
-
+
 // Move all of the PHI values for 'Preds' to the new PHI.
 for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
 Value *V = PN->removeIncomingValue(Preds[i], false);
@@ -451,16 +451,16 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
 /// preserve LoopSimplify (because it's complicated to handle the case where one
 /// of the edges being split is an exit of a loop with other exits).
 ///
-BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
+BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
 ArrayRef<BasicBlock*> Preds,
 const char *Suffix, Pass *P) {
 // Create new basic block, insert right before the original block.
 BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), BB->getName()+Suffix,
 BB->getParent(), BB);
-
+
 // The new block unconditionally branches to the old block.
 BranchInst *BI = BranchInst::Create(BB, NewBB);
-
+
 // Move the edges from Preds to point to NewBB instead of BB.
 for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
 // This is slightly more strict than necessary; the minimum requirement
@@ -497,13 +497,13 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
 /// block gets the remaining predecessors of OrigBB. The landingpad instruction
 /// OrigBB is cloned into both of the new basic blocks. The new blocks are given
 /// the suffixes 'Suffix1' and 'Suffix2', and are returned in the NewBBs vector.
-///
+///
 /// This currently updates the LLVM IR, AliasAnalysis, DominatorTree,
 /// DominanceFrontier, LoopInfo, and LCCSA but no other analyses. In particular,
 /// it does not preserve LoopSimplify (because it's complicated to handle the
 /// case where one of the edges being split is an exit of a loop with other
 /// exits).
-///
+
 void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB,
 ArrayRef<BasicBlock*> Preds,
 const char *Suffix1, const char *Suffix2,
@@ -608,11 +608,11 @@ void llvm::FindFunctionBackedges(const Function &F,
 const BasicBlock *BB = &F.getEntryBlock();
 if (succ_begin(BB) == succ_end(BB))
 return;
-
+
 SmallPtrSet<const BasicBlock*, 8> Visited;
 SmallVector<std::pair<const BasicBlock*, succ_const_iterator>, 8> VisitStack;
 SmallPtrSet<const BasicBlock*, 8> InStack;
-
+
 Visited.insert(BB);
 VisitStack.push_back(std::make_pair(BB, succ_begin(BB)));
 InStack.insert(BB);
@@ -620,7 +620,7 @@ void llvm::FindFunctionBackedges(const Function &F,
 std::pair<const BasicBlock*, succ_const_iterator> &Top = VisitStack.back();
 const BasicBlock *ParentBB = Top.first;
 succ_const_iterator &I = Top.second;
-
+
 bool FoundNew = false;
 while (I != succ_end(ParentBB)) {
 BB = *I++;
@@ -632,7 +632,7 @@ void llvm::FindFunctionBackedges(const Function &F,
 if (InStack.count(BB))
 Result.push_back(std::make_pair(ParentBB, BB));
 }
-
+
 if (FoundNew) {
 // Go down one level if there is an unvisited successor.
 InStack.insert(BB);
@@ -641,7 +641,7 @@ void llvm::FindFunctionBackedges(const Function &F,
 // Go up one level.
 InStack.erase(VisitStack.pop_back_val().first);
 }
- } while (!VisitStack.empty());
+ } while (!VisitStack.empty());
}
/// FoldReturnIntoUncondBranch - This method duplicates the specified return
@@ -655,7 +655,7 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
 // Clone the return and add it to the end of the predecessor.
 Instruction *NewRet = RI->clone();
 Pred->getInstList().push_back(NewRet);
-
+
 // If the return instruction returns a value, and if the value was a
 // PHI node in "BB", propagate the right value into the return.
 for (User::op_iterator i = NewRet->op_begin(), e = NewRet->op_end();
@@ -679,7 +679,7 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
 }
 }
 }
-
+
 // Update any PHI nodes in the returning block to realize that we no
 // longer branch to them.
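The BuildLibCalls.cpp hunks just below all apply one mechanical migration: each local array of AttributeWithIndex entries becomes an array of AttributeSet values, and the per-index constructors move to AttributeSet::get. A sketch of the post-migration construction pattern, mirroring the strlen case (illustrative helper name, same signatures as used in the hunks):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/IR/Attributes.h"
  #include "llvm/IR/LLVMContext.h"
  using namespace llvm;

  static AttributeSet strlenAttrs(LLVMContext &Ctx) {
    AttributeSet AS[2];
    // Parameter 1 (the string pointer) does not capture its argument.
    AS[0] = AttributeSet::get(Ctx, 1, Attribute::NoCapture);
    // The function itself is readonly and nounwind.
    Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
    AS[1] = AttributeSet::get(Ctx, AttributeSet::FunctionIndex,
                              ArrayRef<Attribute::AttrKind>(AVs, 2));
    // Fold the per-index sets into one set for the declaration.
    return AttributeSet::get(Ctx, AS);
  }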
BB->removePredecessor(Pred); diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp index bf540b0..6d13217 100644 --- a/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/lib/Transforms/Utils/BuildLibCalls.cpp @@ -38,16 +38,16 @@ Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout *TD, return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeWithIndex AWI[2]; - AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture); + AttributeSet AS[2]; + AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture); Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind }; - AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - ArrayRef<Attribute::AttrKind>(AVs, 2)); + AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, + ArrayRef<Attribute::AttrKind>(AVs, 2)); LLVMContext &Context = B.GetInsertBlock()->getContext(); Constant *StrLen = M->getOrInsertFunction("strlen", AttributeSet::get(M->getContext(), - AWI), + AS), TD->getIntPtrType(Context), B.getInt8PtrTy(), NULL); @@ -67,16 +67,16 @@ Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B, return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeWithIndex AWI[2]; - AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture); + AttributeSet AS[2]; + AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture); Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind }; - AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - ArrayRef<Attribute::AttrKind>(AVs, 2)); + AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, + ArrayRef<Attribute::AttrKind>(AVs, 2)); LLVMContext &Context = B.GetInsertBlock()->getContext(); Constant *StrNLen = M->getOrInsertFunction("strnlen", AttributeSet::get(M->getContext(), - AWI), + AS), TD->getIntPtrType(Context), B.getInt8PtrTy(), TD->getIntPtrType(Context), @@ -98,15 +98,15 @@ Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B, Module *M = B.GetInsertBlock()->getParent()->getParent(); Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind }; - AttributeWithIndex AWI = - AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - ArrayRef<Attribute::AttrKind>(AVs, 2)); + AttributeSet AS = + AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, + ArrayRef<Attribute::AttrKind>(AVs, 2)); Type *I8Ptr = B.getInt8PtrTy(); Type *I32Ty = B.getInt32Ty(); Constant *StrChr = M->getOrInsertFunction("strchr", AttributeSet::get(M->getContext(), - AWI), + AS), I8Ptr, I8Ptr, I32Ty, NULL); CallInst *CI = B.CreateCall2(StrChr, CastToCStr(Ptr, B), ConstantInt::get(I32Ty, C), "strchr"); @@ -123,17 +123,17 @@ Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeWithIndex AWI[3]; - AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(M->getContext(), 2, Attribute::NoCapture); + AttributeSet AS[3]; + AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture); + AS[1] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture); Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind }; - AWI[2] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - ArrayRef<Attribute::AttrKind>(AVs, 2)); + AS[2] = AttributeSet::get(M->getContext(), 
AttributeSet::FunctionIndex, + ArrayRef<Attribute::AttrKind>(AVs, 2)); LLVMContext &Context = B.GetInsertBlock()->getContext(); Value *StrNCmp = M->getOrInsertFunction("strncmp", AttributeSet::get(M->getContext(), - AWI), + AS), B.getInt32Ty(), B.getInt8PtrTy(), B.getInt8PtrTy(), @@ -156,13 +156,13 @@ Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B, return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeWithIndex AWI[2]; - AWI[0] = AttributeWithIndex::get(M->getContext(), 2, Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); + AttributeSet AS[2]; + AS[0] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture); + AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); Type *I8Ptr = B.getInt8PtrTy(); Value *StrCpy = M->getOrInsertFunction(Name, - AttributeSet::get(M->getContext(), AWI), + AttributeSet::get(M->getContext(), AS), I8Ptr, I8Ptr, I8Ptr, NULL); CallInst *CI = B.CreateCall2(StrCpy, CastToCStr(Dst, B), CastToCStr(Src, B), Name); @@ -180,14 +180,14 @@ Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len, return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeWithIndex AWI[2]; - AWI[0] = AttributeWithIndex::get(M->getContext(), 2, Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); + AttributeSet AS[2]; + AS[0] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture); + AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); Type *I8Ptr = B.getInt8PtrTy(); Value *StrNCpy = M->getOrInsertFunction(Name, AttributeSet::get(M->getContext(), - AWI), + AS), I8Ptr, I8Ptr, I8Ptr, Len->getType(), NULL); CallInst *CI = B.CreateCall3(StrNCpy, CastToCStr(Dst, B), CastToCStr(Src, B), @@ -207,12 +207,12 @@ Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize, return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeWithIndex AWI; - AWI = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); + AttributeSet AS; + AS = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); Value *MemCpy = M->getOrInsertFunction("__memcpy_chk", - AttributeSet::get(M->getContext(), AWI), + AttributeSet::get(M->getContext(), AS), B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt8PtrTy(), @@ -235,13 +235,13 @@ Value *llvm::EmitMemChr(Value *Ptr, Value *Val, return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeWithIndex AWI; + AttributeSet AS; Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind }; - AWI = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - ArrayRef<Attribute::AttrKind>(AVs, 2)); + AS = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, + ArrayRef<Attribute::AttrKind>(AVs, 2)); LLVMContext &Context = B.GetInsertBlock()->getContext(); Value *MemChr = M->getOrInsertFunction("memchr", - AttributeSet::get(M->getContext(), AWI), + AttributeSet::get(M->getContext(), AS), B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt32Ty(), @@ -263,16 +263,16 @@ Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2, return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeWithIndex AWI[3]; - AWI[0] = AttributeWithIndex::get(M->getContext(), 1, 
Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(M->getContext(), 2, Attribute::NoCapture); + AttributeSet AS[3]; + AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture); + AS[1] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture); Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind }; - AWI[2] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - ArrayRef<Attribute::AttrKind>(AVs, 2)); + AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, + ArrayRef<Attribute::AttrKind>(AVs, 2)); LLVMContext &Context = B.GetInsertBlock()->getContext(); Value *MemCmp = M->getOrInsertFunction("memcmp", - AttributeSet::get(M->getContext(), AWI), + AttributeSet::get(M->getContext(), AS), B.getInt32Ty(), B.getInt8PtrTy(), B.getInt8PtrTy(), @@ -344,13 +344,13 @@ Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B, const DataLayout *TD, return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeWithIndex AWI[2]; - AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); + AttributeSet AS[2]; + AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture); + AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); Value *PutS = M->getOrInsertFunction("puts", - AttributeSet::get(M->getContext(), AWI), + AttributeSet::get(M->getContext(), AS), B.getInt32Ty(), B.getInt8PtrTy(), NULL); @@ -368,14 +368,14 @@ Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B, return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeWithIndex AWI[2]; - AWI[0] = AttributeWithIndex::get(M->getContext(), 2, Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); + AttributeSet AS[2]; + AS[0] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture); + AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); Constant *F; if (File->getType()->isPointerTy()) F = M->getOrInsertFunction("fputc", - AttributeSet::get(M->getContext(), AWI), + AttributeSet::get(M->getContext(), AS), B.getInt32Ty(), B.getInt32Ty(), File->getType(), NULL); @@ -401,16 +401,16 @@ Value *llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeWithIndex AWI[3]; - AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(M->getContext(), 2, Attribute::NoCapture); - AWI[2] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); + AttributeSet AS[3]; + AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture); + AS[1] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture); + AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); StringRef FPutsName = TLI->getName(LibFunc::fputs); Constant *F; if (File->getType()->isPointerTy()) F = M->getOrInsertFunction(FPutsName, - AttributeSet::get(M->getContext(), AWI), + AttributeSet::get(M->getContext(), AS), B.getInt32Ty(), B.getInt8PtrTy(), File->getType(), NULL); @@ -434,17 +434,17 @@ Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File, return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeWithIndex AWI[3]; - AWI[0] = 
AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(M->getContext(), 4, Attribute::NoCapture); - AWI[2] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); + AttributeSet AS[3]; + AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture); + AS[1] = AttributeSet::get(M->getContext(), 4, Attribute::NoCapture); + AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); StringRef FWriteName = TLI->getName(LibFunc::fwrite); Constant *F; if (File->getType()->isPointerTy()) F = M->getOrInsertFunction(FWriteName, - AttributeSet::get(M->getContext(), AWI), + AttributeSet::get(M->getContext(), AS), TD->getIntPtrType(Context), B.getInt8PtrTy(), TD->getIntPtrType(Context), diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index ccc3eae..a309bce 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -95,18 +95,16 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, for (Function::const_arg_iterator I = OldFunc->arg_begin(), E = OldFunc->arg_end(); I != E; ++I) if (Argument* Anew = dyn_cast<Argument>(VMap[I])) - Anew->addAttr( OldFunc->getAttributes() + Anew->addAttr(OldFunc->getAttributes() .getParamAttributes(I->getArgNo() + 1)); NewFunc->setAttributes(NewFunc->getAttributes() - .addAttr(NewFunc->getContext(), - AttributeSet::ReturnIndex, - OldFunc->getAttributes() - .getRetAttributes())); + .addAttributes(NewFunc->getContext(), + AttributeSet::ReturnIndex, + OldFunc->getAttributes())); NewFunc->setAttributes(NewFunc->getAttributes() - .addAttr(NewFunc->getContext(), - AttributeSet::FunctionIndex, - OldFunc->getAttributes() - .getFnAttributes())); + .addAttributes(NewFunc->getContext(), + AttributeSet::FunctionIndex, + OldFunc->getAttributes())); } diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp index 3a21528..f7c659f 100644 --- a/lib/Transforms/Utils/CodeExtractor.cpp +++ b/lib/Transforms/Utils/CodeExtractor.cpp @@ -15,6 +15,7 @@ #include "llvm/Transforms/Utils/CodeExtractor.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" diff --git a/lib/Transforms/Utils/DemoteRegToStack.cpp b/lib/Transforms/Utils/DemoteRegToStack.cpp index d5c41f5..db525cd 100644 --- a/lib/Transforms/Utils/DemoteRegToStack.cpp +++ b/lib/Transforms/Utils/DemoteRegToStack.cpp @@ -7,6 +7,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/DenseMap.h" #include "llvm/IR/Function.h" @@ -78,12 +79,21 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, InsertPt = &I; ++InsertPt; } else { - // We cannot demote invoke instructions to the stack if their normal edge - // is critical. InvokeInst &II = cast<InvokeInst>(I); - assert(II.getNormalDest()->getSinglePredecessor() && - "Cannot demote invoke with a critical successor!"); - InsertPt = II.getNormalDest()->begin(); + if (II.getNormalDest()->getSinglePredecessor()) + InsertPt = II.getNormalDest()->getFirstInsertionPt(); + else { + // We cannot demote invoke instructions to the stack if their normal edge + // is critical. 
Therefore, split the critical edge and insert the store + // in the newly created basic block. + unsigned SuccNum = GetSuccessorNumber(I.getParent(), II.getNormalDest()); + TerminatorInst *TI = &cast<TerminatorInst>(I); + assert(isCriticalEdge(TI, SuccNum) && + "Expected a critical edge!"); + BasicBlock *BB = SplitCriticalEdge(TI, SuccNum); + assert(BB && "Unable to split critical edge."); + InsertPt = BB->getFirstInsertionPt(); + } } for (; isa<PHINode>(InsertPt) || isa<LandingPadInst>(InsertPt); ++InsertPt) diff --git a/lib/Transforms/Utils/IntegerDivision.cpp b/lib/Transforms/Utils/IntegerDivision.cpp index 5187d7c..3cb8ded 100644 --- a/lib/Transforms/Utils/IntegerDivision.cpp +++ b/lib/Transforms/Utils/IntegerDivision.cpp @@ -418,3 +418,107 @@ bool llvm::expandDivision(BinaryOperator *Div) { return true; } + +/// Generate code to compute the remainder of two integers of bitwidth up to +/// 32 bits. Uses the above routines and extends the inputs/truncates the +/// outputs to operate in 32 bits; that is, these routines are good for targets +/// that have no or very little support for smaller than 32 bit integer +/// arithmetic. +/// +/// @brief Replace Rem with emulation code. +bool llvm::expandRemainderUpTo32Bits(BinaryOperator *Rem) { + assert((Rem->getOpcode() == Instruction::SRem || + Rem->getOpcode() == Instruction::URem) && + "Trying to expand remainder from a non-remainder function"); + + Type *RemTy = Rem->getType(); + if (RemTy->isVectorTy()) + llvm_unreachable("Rem over vectors not supported"); + + unsigned RemTyBitWidth = RemTy->getIntegerBitWidth(); + + if (RemTyBitWidth > 32) + llvm_unreachable("Rem of bitwidth greater than 32 not supported"); + + if (RemTyBitWidth == 32) + return expandRemainder(Rem); + + // If bitwidth smaller than 32 extend inputs, truncate output and proceed + // with 32 bit remainder computation. + IRBuilder<> Builder(Rem); + + Value *ExtDividend; + Value *ExtDivisor; + Value *ExtRem; + Value *Trunc; + Type *Int32Ty = Builder.getInt32Ty(); + + if (Rem->getOpcode() == Instruction::SRem) { + ExtDividend = Builder.CreateSExt(Rem->getOperand(0), Int32Ty); + ExtDivisor = Builder.CreateSExt(Rem->getOperand(1), Int32Ty); + ExtRem = Builder.CreateSRem(ExtDividend, ExtDivisor); + } else { + ExtDividend = Builder.CreateZExt(Rem->getOperand(0), Int32Ty); + ExtDivisor = Builder.CreateZExt(Rem->getOperand(1), Int32Ty); + ExtRem = Builder.CreateURem(ExtDividend, ExtDivisor); + } + Trunc = Builder.CreateTrunc(ExtRem, RemTy); + + Rem->replaceAllUsesWith(Trunc); + Rem->dropAllReferences(); + Rem->eraseFromParent(); + + return expandRemainder(cast<BinaryOperator>(ExtRem)); +} + + +/// Generate code to divide two integers of bitwidth up to 32 bits. Uses the +/// above routines and extends the inputs/truncates the outputs to operate +/// in 32 bits; that is, these routines are good for targets that have no +/// or very little support for smaller than 32 bit integer arithmetic. +/// +/// @brief Replace Div with emulation code. 
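// A usage sketch (expandSmallRemainders is a hypothetical helper, not part of
// this patch, and it assumes the matching declarations appear in
// llvm/Transforms/Utils/IntegerDivision.h): a lowering pass would typically
// collect the narrow srem/urem instructions first and expand afterwards,
// since the expansion erases the original instruction and may rewrite the CFG.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"

static bool expandSmallRemainders(llvm::Function &F) {
  using namespace llvm;
  // First pass: gather scalar integer remainders of width <= 32.
  SmallVector<BinaryOperator *, 8> Worklist;
  for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE; ++BB)
    for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); I != IE; ++I)
      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I))
        if ((BO->getOpcode() == Instruction::SRem ||
             BO->getOpcode() == Instruction::URem) &&
            BO->getType()->isIntegerTy() &&
            BO->getType()->getIntegerBitWidth() <= 32)
          Worklist.push_back(BO);

  // Second pass: expand each one; the pointers stay valid even though the
  // expansion may split blocks.
  bool Changed = false;
  for (unsigned i = 0, e = Worklist.size(); i != e; ++i)
    Changed |= expandRemainderUpTo32Bits(Worklist[i]);
  return Changed;
}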
+bool llvm::expandDivisionUpTo32Bits(BinaryOperator *Div) { + assert((Div->getOpcode() == Instruction::SDiv || + Div->getOpcode() == Instruction::UDiv) && + "Trying to expand division from a non-division function"); + + Type *DivTy = Div->getType(); + if (DivTy->isVectorTy()) + llvm_unreachable("Div over vectors not supported"); + + unsigned DivTyBitWidth = DivTy->getIntegerBitWidth(); + + if (DivTyBitWidth > 32) + llvm_unreachable("Div of bitwidth greater than 32 not supported"); + + if (DivTyBitWidth == 32) + return expandDivision(Div); + + // If bitwidth smaller than 32 extend inputs, truncate output and proceed + // with 32 bit division. + IRBuilder<> Builder(Div); + + Value *ExtDividend; + Value *ExtDivisor; + Value *ExtDiv; + Value *Trunc; + Type *Int32Ty = Builder.getInt32Ty(); + + if (Div->getOpcode() == Instruction::SDiv) { + ExtDividend = Builder.CreateSExt(Div->getOperand(0), Int32Ty); + ExtDivisor = Builder.CreateSExt(Div->getOperand(1), Int32Ty); + ExtDiv = Builder.CreateSDiv(ExtDividend, ExtDivisor); + } else { + ExtDividend = Builder.CreateZExt(Div->getOperand(0), Int32Ty); + ExtDivisor = Builder.CreateZExt(Div->getOperand(1), Int32Ty); + ExtDiv = Builder.CreateUDiv(ExtDividend, ExtDivisor); + } + Trunc = Builder.CreateTrunc(ExtDiv, DivTy); + + Div->replaceAllUsesWith(Trunc); + Div->dropAllReferences(); + Div->eraseFromParent(); + + return expandDivision(cast<BinaryOperator>(ExtDiv)); +} diff --git a/lib/Transforms/Utils/MetaRenamer.cpp b/lib/Transforms/Utils/MetaRenamer.cpp index d519fb7..3716f58 100644 --- a/lib/Transforms/Utils/MetaRenamer.cpp +++ b/lib/Transforms/Utils/MetaRenamer.cpp @@ -72,13 +72,23 @@ namespace { // Rename all aliases for (Module::alias_iterator AI = M.alias_begin(), AE = M.alias_end(); - AI != AE; ++AI) - AI->setName("alias"); + AI != AE; ++AI) { + StringRef Name = AI->getName(); + if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1)) + continue; + AI->setName("alias"); + } + // Rename all global variables for (Module::global_iterator GI = M.global_begin(), GE = M.global_end(); - GI != GE; ++GI) + GI != GE; ++GI) { + StringRef Name = GI->getName(); + if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1)) + continue; + GI->setName("global"); + } // Rename all struct types TypeFinder StructTypes; @@ -95,6 +105,10 @@ namespace { // Rename all functions for (Module::iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) { + StringRef Name = FI->getName(); + if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1)) + continue; + FI->setName(metaNames[prng.rand() % array_lengthof(metaNames)]); runOnFunction(*FI); } diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index f10c35f..a63d31d 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -1332,149 +1332,180 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { return Changed; } -/// SpeculativelyExecuteBB - Given a conditional branch that goes to BB1 -/// and an BB2 and the only successor of BB1 is BB2, hoist simple code -/// (for now, restricted to a single instruction that's side effect free) from -/// the BB1 into the branch block to speculatively execute it. +/// \brief Speculate a conditional basic block flattening the CFG. /// -/// Turn -/// BB: -/// %t1 = icmp -/// br i1 %t1, label %BB1, label %BB2 -/// BB1: -/// %t3 = add %t2, c +/// Note that this is a very risky transform currently. Speculating +/// instructions like this is most often not desirable. 
Instead, there is an MI +/// pass which can do it with full awareness of the resource constraints. +/// However, some cases are "obvious" and we should do so directly. An example of +/// this is speculating a single, reasonably cheap instruction. +/// +/// There is only one distinct advantage to flattening the CFG at the IR level: +/// it makes very common but simplistic optimizations, such as those in +/// instcombine and the DAG combiner, more powerful by removing CFG edges and +/// modeling their effects with easier to reason about SSA value graphs. +/// +/// +/// An illustration of this transform is turning this IR: +/// \code +/// BB: +/// %cmp = icmp ult %x, %y +/// br i1 %cmp, label %EndBB, label %ThenBB +/// ThenBB: +/// %sub = sub %x, %y +/// br label %EndBB -/// br label BB2 -/// BB2: -/// => -/// BB: -/// %t1 = icmp -/// %t4 = add %t2, c -/// %t3 = select i1 %t1, %t2, %t3 -static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *BB1) { - // Only speculatively execution a single instruction (not counting the - // terminator) for now. - Instruction *HInst = NULL; - Instruction *Term = BB1->getTerminator(); - for (BasicBlock::iterator BBI = BB1->begin(), BBE = BB1->end(); +/// EndBB: +/// %phi = phi [ %sub, %ThenBB ], [ 0, %BB ] +/// ... +/// \endcode +/// +/// Into this IR: +/// \code +/// BB: +/// %cmp = icmp ult %x, %y +/// %sub = sub %x, %y +/// %cond = select i1 %cmp, 0, %sub +/// ... +/// \endcode +/// +/// \returns true if the conditional block is removed. +static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) { + // Be conservative for now. FP select instruction can often be expensive. + Value *BrCond = BI->getCondition(); + if (isa<FCmpInst>(BrCond)) + return false; + + BasicBlock *BB = BI->getParent(); + BasicBlock *EndBB = ThenBB->getTerminator()->getSuccessor(0); + + // If ThenBB is actually on the false edge of the conditional branch, remember + // to swap the select operands later. + bool Invert = false; + if (ThenBB != BI->getSuccessor(0)) { + assert(ThenBB == BI->getSuccessor(1) && "No edge from 'if' block?"); + Invert = true; + } + assert(EndBB == BI->getSuccessor(!Invert) && "No edge from 'if' block to end block"); + + // Keep a count of how many times instructions are used within ThenBB when + // they are candidates for sinking into ThenBB. Specifically: + // - They are defined in BB, and + // - They have no side effects, and + // - All of their uses are in ThenBB. + SmallDenseMap<Instruction *, unsigned, 4> SinkCandidateUseCounts; + + unsigned SpeculationCost = 0; + for (BasicBlock::iterator BBI = ThenBB->begin(), + BBE = llvm::prior(ThenBB->end()); BBI != BBE; ++BBI) { Instruction *I = BBI; // Skip debug info. - if (isa<DbgInfoIntrinsic>(I)) continue; - if (I == Term) break; + if (isa<DbgInfoIntrinsic>(I)) + continue; - if (HInst) + // Only speculatively execute a single instruction (not counting the + // terminator) for now. + ++SpeculationCost; + if (SpeculationCost > 1) return false; - HInst = I; - } - - BasicBlock *BIParent = BI->getParent(); - // Check the instruction to be hoisted, if there is one. - if (HInst) { // Don't hoist the instruction if it's unsafe or expensive. - if (!isSafeToSpeculativelyExecute(HInst)) + if (!isSafeToSpeculativelyExecute(I)) return false; - if (ComputeSpeculationCost(HInst) > PHINodeFoldingThreshold) + if (ComputeSpeculationCost(I) > PHINodeFoldingThreshold) return false; // Do not hoist the instruction if any of its operands are defined but not // used in this BB. 
The transformation will prevent the operand from // being sunk into the use block. - for (User::op_iterator i = HInst->op_begin(), e = HInst->op_end(); + for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) { Instruction *OpI = dyn_cast<Instruction>(*i); - if (OpI && OpI->getParent() == BIParent && - !OpI->mayHaveSideEffects() && - !OpI->isUsedInBasicBlock(BIParent)) - return false; + if (!OpI || OpI->getParent() != BB || + OpI->mayHaveSideEffects()) + continue; // Not a candidate for sinking. + + ++SinkCandidateUseCounts[OpI]; } } - // Be conservative for now. FP select instruction can often be expensive. - Value *BrCond = BI->getCondition(); - if (isa<FCmpInst>(BrCond)) - return false; - - // If BB1 is actually on the false edge of the conditional branch, remember - // to swap the select operands later. - bool Invert = false; - if (BB1 != BI->getSuccessor(0)) { - assert(BB1 == BI->getSuccessor(1) && "No edge from 'if' block?"); - Invert = true; - } + // Consider any sink candidates which are only used in ThenBB as costs for + // speculation. Note that while we iterate over a DenseMap here, we are summing + // and so iteration order isn't significant. + for (SmallDenseMap<Instruction *, unsigned, 4>::iterator I = + SinkCandidateUseCounts.begin(), E = SinkCandidateUseCounts.end(); + I != E; ++I) + if (I->first->getNumUses() == I->second) { + ++SpeculationCost; + if (SpeculationCost > 1) + return false; + } - // Collect interesting PHIs, and scan for hazards. - SmallSetVector<std::pair<Value *, Value *>, 4> PHIs; - BasicBlock *BB2 = BB1->getTerminator()->getSuccessor(0); - for (BasicBlock::iterator I = BB2->begin(); + // Check that the PHI nodes can be converted to selects. + bool HaveRewritablePHIs = false; + for (BasicBlock::iterator I = EndBB->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I) { - Value *BB1V = PN->getIncomingValueForBlock(BB1); - Value *BIParentV = PN->getIncomingValueForBlock(BIParent); + Value *OrigV = PN->getIncomingValueForBlock(BB); + Value *ThenV = PN->getIncomingValueForBlock(ThenBB); // Skip PHIs which are trivial. - if (BB1V == BIParentV) + if (ThenV == OrigV) continue; - // Check for safety. - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BB1V)) { - // An unfolded ConstantExpr could end up getting expanded into - // Instructions. Don't speculate this and another instruction at - // the same time. - if (HInst) - return false; - if (!isSafeToSpeculativelyExecute(CE)) - return false; - if (ComputeSpeculationCost(CE) > PHINodeFoldingThreshold) - return false; - } + HaveRewritablePHIs = true; + ConstantExpr *CE = dyn_cast<ConstantExpr>(ThenV); + if (!CE) + continue; // Known safe and cheap. + + if (!isSafeToSpeculativelyExecute(CE)) + return false; + if (ComputeSpeculationCost(CE) > PHINodeFoldingThreshold) + return false; - // Ok, we may insert a select for this PHI. - PHIs.insert(std::make_pair(BB1V, BIParentV)); + // Account for the cost of an unfolded ConstantExpr which could end up + // getting expanded into Instructions. + // FIXME: This doesn't account for how many operations are combined in the + // constant expression. + ++SpeculationCost; + if (SpeculationCost > 1) + return false; } // If there are no PHIs to process, bail early. This helps ensure idempotence // as well. - if (PHIs.empty()) + if (!HaveRewritablePHIs) return false; // If we get here, we can hoist the instruction and if-convert. 
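  // For comparison (hypothetical IR), the non-inverted case, where ThenBB sits
  // on the true edge of the branch, reads:
  // \code
  //  BB:
  //   %cmp = icmp ult %x, %y
  //   br i1 %cmp, label %ThenBB, label %EndBB
  // \endcode
  // and flattens to:
  // \code
  //  BB:
  //   %cmp = icmp ult %x, %y
  //   %sub = sub %x, %y
  //   %cond = select i1 %cmp, %sub, 0
  // \endcode
  // i.e. the speculated value occupies the select operand matching the edge
  // on which ThenBB was reached.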
- DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *BB1 << "\n";); + DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";); - // Hoist the instruction. - if (HInst) - BIParent->getInstList().splice(BI, BB1->getInstList(), HInst); + // Hoist the instructions. + BB->getInstList().splice(BI, ThenBB->getInstList(), ThenBB->begin(), + llvm::prior(ThenBB->end())); // Insert selects and rewrite the PHI operands. IRBuilder<true, NoFolder> Builder(BI); - for (unsigned i = 0, e = PHIs.size(); i != e; ++i) { - Value *TrueV = PHIs[i].first; - Value *FalseV = PHIs[i].second; + for (BasicBlock::iterator I = EndBB->begin(); + PHINode *PN = dyn_cast<PHINode>(I); ++I) { + unsigned OrigI = PN->getBasicBlockIndex(BB); + unsigned ThenI = PN->getBasicBlockIndex(ThenBB); + Value *OrigV = PN->getIncomingValue(OrigI); + Value *ThenV = PN->getIncomingValue(ThenI); + + // Skip PHIs which are trivial. + if (OrigV == ThenV) + continue; // Create a select whose true value is the speculatively executed value and - // false value is the previously determined FalseV. - SelectInst *SI; + // false value is the preexisting value. Swap them if the branch + // destinations were inverted. + Value *TrueV = ThenV, *FalseV = OrigV; if (Invert) - SI = cast<SelectInst> - (Builder.CreateSelect(BrCond, FalseV, TrueV, - FalseV->getName() + "." + TrueV->getName())); - else - SI = cast<SelectInst> - (Builder.CreateSelect(BrCond, TrueV, FalseV, - TrueV->getName() + "." + FalseV->getName())); - - // Make the PHI node use the select for all incoming values for "then" and - // "if" blocks. - for (BasicBlock::iterator I = BB2->begin(); - PHINode *PN = dyn_cast<PHINode>(I); ++I) { - unsigned BB1I = PN->getBasicBlockIndex(BB1); - unsigned BIParentI = PN->getBasicBlockIndex(BIParent); - Value *BB1V = PN->getIncomingValue(BB1I); - Value *BIParentV = PN->getIncomingValue(BIParentI); - if (TrueV == BB1V && FalseV == BIParentV) { - PN->setIncomingValue(BB1I, SI); - PN->setIncomingValue(BIParentI, SI); - } - } + std::swap(TrueV, FalseV); + Value *V = Builder.CreateSelect(BrCond, TrueV, FalseV, + TrueV->getName() + "." + FalseV->getName()); + PN->setIncomingValue(OrigI, V); + PN->setIncomingValue(ThenI, V); } ++NumSpeculations; @@ -3382,7 +3413,8 @@ SwitchLookupTable::SwitchLookupTable(Module &M, ConstantInt *Offset, const SmallVector<std::pair<ConstantInt*, Constant*>, 4>& Values, Constant *DefaultValue, - const DataLayout *TD) { + const DataLayout *TD) + : SingleValue(0), BitMap(0), BitMapElementTy(0), Array(0) { assert(Values.size() && "Can't build lookup table without values!"); assert(TableSize >= Values.size() && "Can't fit values in table!"); diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index 83c74e7..8ad566c 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -50,6 +50,10 @@ public: virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) =0; + /// ignoreCallingConv - Returns false if this transformation could possibly + /// change the calling convention. + virtual bool ignoreCallingConv() { return false; } + Value *optimizeCall(CallInst *CI, const DataLayout *TD, const TargetLibraryInfo *TLI, const LibCallSimplifier *LCS, IRBuilder<> &B) { @@ -61,7 +65,7 @@ public: Context = &CI->getCalledFunction()->getContext(); // We never change the calling convention. 
- if (CI->getCallingConv() != llvm::CallingConv::C) + if (!ignoreCallingConv() && CI->getCallingConv() != llvm::CallingConv::C) return NULL; return callOptimizer(CI->getCalledFunction(), CI, B); @@ -724,6 +728,7 @@ struct StrNCpyOpt : public LibCallOptimization { }; struct StrLenOpt : public LibCallOptimization { + virtual bool ignoreCallingConv() { return true; } virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 1 || @@ -1260,6 +1265,7 @@ struct FFSOpt : public LibCallOptimization { }; struct AbsOpt : public LibCallOptimization { + virtual bool ignoreCallingConv() { return true; } virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { FunctionType *FT = Callee->getFunctionType(); // We require integer(integer) where the types agree. @@ -1883,6 +1889,7 @@ LibCallSimplifier::~LibCallSimplifier() { } Value *LibCallSimplifier::optimizeCall(CallInst *CI) { + if (CI->hasFnAttr(Attribute::NoBuiltin)) return 0; return Impl->optimizeCall(CI); } diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp index a5e1643..b5941bd 100644 --- a/lib/Transforms/Utils/ValueMapper.cpp +++ b/lib/Transforms/Utils/ValueMapper.cpp @@ -63,14 +63,29 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, // Check all operands to see if any need to be remapped. for (unsigned i = 0, e = MD->getNumOperands(); i != e; ++i) { Value *OP = MD->getOperand(i); - if (OP == 0 || MapValue(OP, VM, Flags, TypeMapper) == OP) continue; + if (OP == 0) continue; + Value *Mapped_OP = MapValue(OP, VM, Flags, TypeMapper); + // Use identity map if Mapped_Op is null and we can ignore missing + // entries. + if (Mapped_OP == OP || + (Mapped_OP == 0 && (Flags & RF_IgnoreMissingEntries))) + continue; // Ok, at least one operand needs remapping. SmallVector<Value*, 4> Elts; Elts.reserve(MD->getNumOperands()); for (i = 0; i != e; ++i) { Value *Op = MD->getOperand(i); - Elts.push_back(Op ? MapValue(Op, VM, Flags, TypeMapper) : 0); + if (Op == 0) + Elts.push_back(0); + else { + Value *Mapped_Op = MapValue(Op, VM, Flags, TypeMapper); + // Use identity map if Mapped_Op is null and we can ignore missing + // entries. 
+ if (Mapped_Op == 0 && (Flags & RF_IgnoreMissingEntries)) + Mapped_Op = Op; + Elts.push_back(Mapped_Op); + } } MDNode *NewMD = MDNode::get(V->getContext(), Elts); Dummy->replaceAllUsesWith(NewMD); diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp index d72a4a1..7636541 100644 --- a/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/lib/Transforms/Vectorize/BBVectorize.cpp @@ -48,7 +48,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" #include <algorithm> -#include <map> using namespace llvm; static cl::opt<bool> @@ -89,6 +88,10 @@ MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden, cl::desc("The maximum number of pairable instructions per group")); static cl::opt<unsigned> +MaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden, + cl::desc("The maximum number of candidate instruction pairs per group")); + +static cl::opt<unsigned> MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use" " a full cycle check")); @@ -207,11 +210,6 @@ namespace { typedef std::pair<ValuePair, size_t> ValuePairWithDepth; typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair typedef std::pair<VPPair, unsigned> VPPairWithType; - typedef std::pair<std::multimap<Value *, Value *>::iterator, - std::multimap<Value *, Value *>::iterator> VPIteratorPair; - typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator, - std::multimap<ValuePair, ValuePair>::iterator> - VPPIteratorPair; AliasAnalysis *AA; DominatorTree *DT; @@ -225,7 +223,7 @@ namespace { bool getCandidatePairs(BasicBlock &BB, BasicBlock::iterator &Start, - std::multimap<Value *, Value *> &CandidatePairs, + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, DenseSet<ValuePair> &FixedOrderPairs, DenseMap<ValuePair, int> &CandidatePairCostSavings, std::vector<Value *> &PairableInsts, bool NonPow2Len); @@ -239,33 +237,36 @@ namespace { PairConnectionSplat }; - void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes); + void computeConnectedPairs( + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + DenseSet<ValuePair> &CandidatePairsSet, + std::vector<Value *> &PairableInsts, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes); void buildDepMap(BasicBlock &BB, - std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts, - DenseSet<ValuePair> &PairableInstUsers); - - void choosePairs(std::multimap<Value *, Value *> &CandidatePairs, - DenseMap<ValuePair, int> &CandidatePairCostSavings, - std::vector<Value *> &PairableInsts, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<Value *, Value *>& ChosenPairs); + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &PairableInstUsers); + + void choosePairs(DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + DenseSet<ValuePair> &CandidatePairsSet, + DenseMap<ValuePair, int> &CandidatePairCostSavings, + std::vector<Value *> &PairableInsts, + 
DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<Value *, Value *>& ChosenPairs); void fuseChosenPairs(BasicBlock &BB, - std::vector<Value *> &PairableInsts, - DenseMap<Value *, Value *>& ChosenPairs, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - std::multimap<ValuePair, ValuePair> &ConnectedPairDeps); + std::vector<Value *> &PairableInsts, + DenseMap<Value *, Value *>& ChosenPairs, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps); bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); @@ -277,56 +278,63 @@ namespace { bool trackUsesOfI(DenseSet<Value *> &Users, AliasSetTracker &WriteSet, Instruction *I, Instruction *J, bool UpdateUsers = true, - std::multimap<Value *, Value *> *LoadMoveSet = 0); + DenseSet<ValuePair> *LoadMoveSetPairs = 0); - void computePairsConnectedTo( - std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - ValuePair P); + void computePairsConnectedTo( + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + DenseSet<ValuePair> &CandidatePairsSet, + std::vector<Value *> &PairableInsts, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + ValuePair P); bool pairsConflict(ValuePair P, ValuePair Q, - DenseSet<ValuePair> &PairableInstUsers, - std::multimap<ValuePair, ValuePair> *PairableInstUserMap = 0); + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<ValuePair, std::vector<ValuePair> > + *PairableInstUserMap = 0, + DenseSet<VPPair> *PairableInstUserPairSet = 0); bool pairWillFormCycle(ValuePair P, - std::multimap<ValuePair, ValuePair> &PairableInstUsers, - DenseSet<ValuePair> &CurrentPairs); - - void pruneTreeFor( - std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - DenseSet<ValuePair> &PairableInstUsers, - std::multimap<ValuePair, ValuePair> &PairableInstUserMap, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<ValuePair, size_t> &Tree, - DenseSet<ValuePair> &PrunedTree, ValuePair J, - bool UseCycleCheck); - - void buildInitialTreeFor( - std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<ValuePair, size_t> &Tree, ValuePair J); - - void findBestTreeFor( - std::multimap<Value *, Value *> &CandidatePairs, - DenseMap<ValuePair, int> &CandidatePairCostSavings, - std::vector<Value *> &PairableInsts, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, - DenseSet<ValuePair> &PairableInstUsers, - std::multimap<ValuePair, ValuePair> &PairableInstUserMap, - DenseMap<Value *, Value *> &ChosenPairs, - DenseSet<ValuePair> &BestTree, size_t 
&BestMaxDepth, - int &BestEffSize, VPIteratorPair ChoiceRange, - bool UseCycleCheck); + DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers, + DenseSet<ValuePair> &CurrentPairs); + + void pruneDAGFor( + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + std::vector<Value *> &PairableInsts, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, + DenseSet<VPPair> &PairableInstUserPairSet, + DenseMap<Value *, Value *> &ChosenPairs, + DenseMap<ValuePair, size_t> &DAG, + DenseSet<ValuePair> &PrunedDAG, ValuePair J, + bool UseCycleCheck); + + void buildInitialDAGFor( + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + DenseSet<ValuePair> &CandidatePairsSet, + std::vector<Value *> &PairableInsts, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<Value *, Value *> &ChosenPairs, + DenseMap<ValuePair, size_t> &DAG, ValuePair J); + + void findBestDAGFor( + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + DenseSet<ValuePair> &CandidatePairsSet, + DenseMap<ValuePair, int> &CandidatePairCostSavings, + std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, + DenseSet<VPPair> &PairableInstUserPairSet, + DenseMap<Value *, Value *> &ChosenPairs, + DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth, + int &BestEffSize, Value *II, std::vector<Value *>&JJ, + bool UseCycleCheck); Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, Instruction *J, unsigned o); @@ -358,20 +366,22 @@ namespace { void collectPairLoadMoveSet(BasicBlock &BB, DenseMap<Value *, Value *> &ChosenPairs, - std::multimap<Value *, Value *> &LoadMoveSet, + DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, + DenseSet<ValuePair> &LoadMoveSetPairs, Instruction *I); void collectLoadMoveSet(BasicBlock &BB, std::vector<Value *> &PairableInsts, DenseMap<Value *, Value *> &ChosenPairs, - std::multimap<Value *, Value *> &LoadMoveSet); + DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, + DenseSet<ValuePair> &LoadMoveSetPairs); bool canMoveUsesOfIAfterJ(BasicBlock &BB, - std::multimap<Value *, Value *> &LoadMoveSet, + DenseSet<ValuePair> &LoadMoveSetPairs, Instruction *I, Instruction *J); void moveUsesOfIAfterJ(BasicBlock &BB, - std::multimap<Value *, Value *> &LoadMoveSet, + DenseSet<ValuePair> &LoadMoveSetPairs, Instruction *&InsertionPt, Instruction *I, Instruction *J); @@ -463,18 +473,18 @@ namespace { static inline void getInstructionTypes(Instruction *I, Type *&T1, Type *&T2) { - if (isa<StoreInst>(I)) { + if (StoreInst *SI = dyn_cast<StoreInst>(I)) { // For stores, it is the value type, not the pointer type that matters // because the value is what will come from a vector register. - Value *IVal = cast<StoreInst>(I)->getValueOperand(); + Value *IVal = SI->getValueOperand(); T1 = IVal->getType(); } else { T1 = I->getType(); } - if (I->isCast()) - T2 = cast<CastInst>(I)->getSrcTy(); + if (CastInst *CI = dyn_cast<CastInst>(I)) + T2 = CI->getSrcTy(); else T2 = T1; @@ -500,7 +510,7 @@ namespace { // InsertElement and ExtractElement have a depth factor of zero. 
This is // for two reasons: First, they cannot be usefully fused. Second, because // the pass generates a lot of these, they can confuse the simple metric - // used to compare the trees in the next iteration. Thus, giving them a + // used to compare the dags in the next iteration. Thus, giving them a // weight of zero allows the pass to essentially ignore them in // subsequent iterations when looking for vectorization opportunities // while still tracking dependency chains that flow through those @@ -661,19 +671,6 @@ namespace { } } - // Returns true if J is the second element in some pair referenced by - // some multimap pair iterator pair. - template <typename V> - bool isSecondInIteratorPair(V J, std::pair< - typename std::multimap<V, V>::iterator, - typename std::multimap<V, V>::iterator> PairRange) { - for (typename std::multimap<V, V>::iterator K = PairRange.first; - K != PairRange.second; ++K) - if (K->second == J) return true; - - return false; - } - bool isPureIEChain(InsertElementInst *IE) { InsertElementInst *IENext = IE; do { @@ -698,11 +695,12 @@ namespace { DenseMap<Value *, Value *> AllChosenPairs; DenseSet<ValuePair> AllFixedOrderPairs; DenseMap<VPPair, unsigned> AllPairConnectionTypes; - std::multimap<ValuePair, ValuePair> AllConnectedPairs, AllConnectedPairDeps; + DenseMap<ValuePair, std::vector<ValuePair> > AllConnectedPairs, + AllConnectedPairDeps; do { std::vector<Value *> PairableInsts; - std::multimap<Value *, Value *> CandidatePairs; + DenseMap<Value *, std::vector<Value *> > CandidatePairs; DenseSet<ValuePair> FixedOrderPairs; DenseMap<ValuePair, int> CandidatePairCostSavings; ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs, @@ -711,6 +709,14 @@ namespace { PairableInsts, NonPow2Len); if (PairableInsts.empty()) continue; + // Build the candidate pair set for faster lookups. + DenseSet<ValuePair> CandidatePairsSet; + for (DenseMap<Value *, std::vector<Value *> >::iterator I = + CandidatePairs.begin(), E = CandidatePairs.end(); I != E; ++I) + for (std::vector<Value *>::iterator J = I->second.begin(), + JE = I->second.end(); J != JE; ++J) + CandidatePairsSet.insert(ValuePair(I->first, *J)); + // Now we have a map of all of the pairable instructions and we need to // select the best possible pairing. A good pairing is one such that the // users of the pair are also paired. This defines a (directed) forest @@ -720,30 +726,33 @@ namespace { // Note that it only matters that both members of the second pair use some // element of the first pair (to allow for splatting). 
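// A minimal sketch (hypothetical helpers, not part of this patch) of the
// lookup discipline behind the new representation: the DenseMap answers
// "what are I's candidate mates?", while the flat DenseSet answers
// "is (I, J) a candidate?" with one hash probe, replacing the old
// multimap::equal_range scans.
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/IR/Value.h"
#include <utility>
#include <vector>

typedef std::pair<llvm::Value *, llvm::Value *> ValuePair;

// O(1) membership test; with the multimap this was a linear range scan.
static bool isCandidatePair(llvm::DenseSet<ValuePair> &CandidatePairsSet,
                            llvm::Value *I, llvm::Value *J) {
  return CandidatePairsSet.count(ValuePair(I, J));
}

// Enumerating all pairs walks each key's mate vector exactly once.
static size_t countCandidatePairs(
    llvm::DenseMap<llvm::Value *, std::vector<llvm::Value *> > &CandidatePairs) {
  size_t N = 0;
  for (llvm::DenseMap<llvm::Value *, std::vector<llvm::Value *> >::iterator
         I = CandidatePairs.begin(), E = CandidatePairs.end(); I != E; ++I)
    N += I->second.size();
  return N;
}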
- std::multimap<ValuePair, ValuePair> ConnectedPairs, ConnectedPairDeps; + DenseMap<ValuePair, std::vector<ValuePair> > ConnectedPairs, + ConnectedPairDeps; DenseMap<VPPair, unsigned> PairConnectionTypes; - computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs, - PairConnectionTypes); + computeConnectedPairs(CandidatePairs, CandidatePairsSet, + PairableInsts, ConnectedPairs, PairConnectionTypes); if (ConnectedPairs.empty()) continue; - for (std::multimap<ValuePair, ValuePair>::iterator + for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); - I != IE; ++I) { - ConnectedPairDeps.insert(VPPair(I->second, I->first)); - } + I != IE; ++I) + for (std::vector<ValuePair>::iterator J = I->second.begin(), + JE = I->second.end(); J != JE; ++J) + ConnectedPairDeps[*J].push_back(I->first); // Build the pairable-instruction dependency map DenseSet<ValuePair> PairableInstUsers; buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers); // There is now a graph of the connected pairs. For each variable, pick - // the pairing with the largest tree meeting the depth requirement on at - // least one branch. Then select all pairings that are part of that tree + // the pairing with the largest dag meeting the depth requirement on at + // least one branch. Then select all pairings that are part of that dag // and remove them from the list of available pairings and pairable // variables. DenseMap<Value *, Value *> ChosenPairs; - choosePairs(CandidatePairs, CandidatePairCostSavings, + choosePairs(CandidatePairs, CandidatePairsSet, + CandidatePairCostSavings, PairableInsts, FixedOrderPairs, PairConnectionTypes, ConnectedPairs, ConnectedPairDeps, PairableInstUsers, ChosenPairs); @@ -777,14 +786,15 @@ namespace { } } - for (std::multimap<ValuePair, ValuePair>::iterator + for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); - I != IE; ++I) { - if (AllPairConnectionTypes.count(*I)) { - AllConnectedPairs.insert(*I); - AllConnectedPairDeps.insert(VPPair(I->second, I->first)); - } - } + I != IE; ++I) + for (std::vector<ValuePair>::iterator J = I->second.begin(), + JE = I->second.end(); J != JE; ++J) + if (AllPairConnectionTypes.count(VPPair(I->first, *J))) { + AllConnectedPairs[I->first].push_back(*J); + AllConnectedPairDeps[*J].push_back(I->first); + } } while (ShouldContinue); if (AllChosenPairs.empty()) return false; @@ -910,7 +920,7 @@ namespace { // This function returns true if the two provided instructions are compatible // (meaning that they can be fused into a vector instruction). This assumes // that I has already been determined to be vectorizable and that J is not - // in the use tree of I. + // in the use dag of I. 
bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, bool IsSimpleLoadStore, bool NonPow2Len, int &CostSavings, int &FixedOrder) { @@ -972,6 +982,11 @@ namespace { unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType, BottomAlignment, IAddressSpace); + + ICost += TTI->getAddressComputationCost(aTypeI); + JCost += TTI->getAddressComputationCost(aTypeJ); + VCost += TTI->getAddressComputationCost(VType); + if (VCost > ICost + JCost) return false; @@ -994,6 +1009,12 @@ namespace { unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2); Type *VT1 = getVecTypeForPair(IT1, JT1), *VT2 = getVecTypeForPair(IT2, JT2); + + // Note that this procedure is incorrect for insert and extract element + // instructions (because combining these often results in a shuffle), + // but this cost is ignored (because insert and extract element + // instructions are assigned a zero depth factor and are not really + // fused in general). unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2); if (VCost > ICost + JCost) @@ -1090,7 +1111,7 @@ namespace { // to contain any memory locations to which J writes. The function returns // true if J uses I. By default, alias analysis is used to determine // whether J reads from memory that overlaps with a location in WriteSet. - // If LoadMoveSet is not null, then it is a previously-computed multimap + // If LoadMoveSet is not null, then it is a previously-computed map // where the key is the memory-based user instruction and the value is // the instruction to be compared with I. So, if LoadMoveSet is provided, // then the alias analysis is not used. This is necessary because this @@ -1100,7 +1121,7 @@ namespace { bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users, AliasSetTracker &WriteSet, Instruction *I, Instruction *J, bool UpdateUsers, - std::multimap<Value *, Value *> *LoadMoveSet) { + DenseSet<ValuePair> *LoadMoveSetPairs) { bool UsesI = false; // This instruction may already be marked as a user due, for example, to @@ -1118,9 +1139,8 @@ namespace { } } if (!UsesI && J->mayReadFromMemory()) { - if (LoadMoveSet) { - VPIteratorPair JPairRange = LoadMoveSet->equal_range(J); - UsesI = isSecondInIteratorPair<Value*>(I, JPairRange); + if (LoadMoveSetPairs) { + UsesI = LoadMoveSetPairs->count(ValuePair(J, I)); } else { for (AliasSetTracker::iterator W = WriteSet.begin(), WE = WriteSet.end(); W != WE; ++W) { @@ -1144,10 +1164,11 @@ namespace { // basic block and collects all candidate pairs for vectorization. bool BBVectorize::getCandidatePairs(BasicBlock &BB, BasicBlock::iterator &Start, - std::multimap<Value *, Value *> &CandidatePairs, + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, DenseSet<ValuePair> &FixedOrderPairs, DenseMap<ValuePair, int> &CandidatePairCostSavings, std::vector<Value *> &PairableInsts, bool NonPow2Len) { + size_t TotalPairs = 0; BasicBlock::iterator E = BB.end(); if (Start == E) return false; @@ -1193,7 +1214,8 @@ namespace { PairableInsts.push_back(I); } - CandidatePairs.insert(ValuePair(I, J)); + CandidatePairs[I].push_back(J); + ++TotalPairs; if (TTI) CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J), CostSavings)); @@ -1217,7 +1239,8 @@ namespace { // If we have already found too many pairs, break here and this function // will be called again starting after the last instruction selected // during this invocation. 
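    // (TotalPairs, counted above as each candidate is recorded, feeds the new
    // second bound below: a group is now also capped by Config.MaxPairs, since
    // the number of candidate pairs, not just of pairable instructions, is
    // what the subsequent connection search scales with.)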
- if (PairableInsts.size() >= Config.MaxInsts) { + if (PairableInsts.size() >= Config.MaxInsts || + TotalPairs >= Config.MaxPairs) { ShouldContinue = true; break; } @@ -1237,11 +1260,12 @@ namespace { // it looks for pairs such that both members have an input which is an // output of PI or PJ. void BBVectorize::computePairsConnectedTo( - std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - ValuePair P) { + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + DenseSet<ValuePair> &CandidatePairsSet, + std::vector<Value *> &PairableInsts, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + ValuePair P) { StoreInst *SI, *SJ; // For each possible pairing for this variable, look at the uses of @@ -1259,8 +1283,6 @@ namespace { continue; } - VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); - // For each use of the first variable, look for uses of the second // variable... for (Value::use_iterator J = P.second->use_begin(), @@ -1269,19 +1291,17 @@ namespace { P.second == SJ->getPointerOperand()) continue; - VPIteratorPair JPairRange = CandidatePairs.equal_range(*J); - // Look for <I, J>: - if (isSecondInIteratorPair<Value*>(*J, IPairRange)) { + if (CandidatePairsSet.count(ValuePair(*I, *J))) { VPPair VP(P, ValuePair(*I, *J)); - ConnectedPairs.insert(VP); + ConnectedPairs[VP.first].push_back(VP.second); PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect)); } // Look for <J, I>: - if (isSecondInIteratorPair<Value*>(*I, JPairRange)) { + if (CandidatePairsSet.count(ValuePair(*J, *I))) { VPPair VP(P, ValuePair(*J, *I)); - ConnectedPairs.insert(VP); + ConnectedPairs[VP.first].push_back(VP.second); PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap)); } } @@ -1294,9 +1314,9 @@ namespace { P.first == SJ->getPointerOperand()) continue; - if (isSecondInIteratorPair<Value*>(*J, IPairRange)) { + if (CandidatePairsSet.count(ValuePair(*I, *J))) { VPPair VP(P, ValuePair(*I, *J)); - ConnectedPairs.insert(VP); + ConnectedPairs[VP.first].push_back(VP.second); PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); } } @@ -1313,16 +1333,14 @@ namespace { P.second == SI->getPointerOperand()) continue; - VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); - for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) { if ((SJ = dyn_cast<StoreInst>(*J)) && P.second == SJ->getPointerOperand()) continue; - if (isSecondInIteratorPair<Value*>(*J, IPairRange)) { + if (CandidatePairsSet.count(ValuePair(*I, *J))) { VPPair VP(P, ValuePair(*I, *J)); - ConnectedPairs.insert(VP); + ConnectedPairs[VP.first].push_back(VP.second); PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); } } @@ -1333,55 +1351,73 @@ namespace { // connected if some output of the first pair forms an input to both members // of the second pair. 
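  // For example (hypothetical IR): with candidate pairs P = <x0, x1> and
  // Q = <y0, y1>, if y0 = add x0, c and y1 = add x1, c then P and Q are
  // connected with type PairConnectionDirect; if instead y0 = add x1, c and
  // y1 = add x0, c the connection is PairConnectionSwap; and if y0 and y1
  // both use x0 alone, it is PairConnectionSplat.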
void BBVectorize::computeConnectedPairs( - std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes) { - + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + DenseSet<ValuePair> &CandidatePairsSet, + std::vector<Value *> &PairableInsts, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes) { for (std::vector<Value *>::iterator PI = PairableInsts.begin(), PE = PairableInsts.end(); PI != PE; ++PI) { - VPIteratorPair choiceRange = CandidatePairs.equal_range(*PI); + DenseMap<Value *, std::vector<Value *> >::iterator PP = + CandidatePairs.find(*PI); + if (PP == CandidatePairs.end()) + continue; - for (std::multimap<Value *, Value *>::iterator P = choiceRange.first; - P != choiceRange.second; ++P) - computePairsConnectedTo(CandidatePairs, PairableInsts, - ConnectedPairs, PairConnectionTypes, *P); + for (std::vector<Value *>::iterator P = PP->second.begin(), + E = PP->second.end(); P != E; ++P) + computePairsConnectedTo(CandidatePairs, CandidatePairsSet, + PairableInsts, ConnectedPairs, + PairConnectionTypes, ValuePair(*PI, *P)); } - DEBUG(dbgs() << "BBV: found " << ConnectedPairs.size() + DEBUG(size_t TotalPairs = 0; + for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I = + ConnectedPairs.begin(), IE = ConnectedPairs.end(); I != IE; ++I) + TotalPairs += I->second.size(); + dbgs() << "BBV: found " << TotalPairs << " pair connections.\n"); } // This function builds a set of use tuples such that <A, B> is in the set - // if B is in the use tree of A. If B is in the use tree of A, then B + // if B is in the use dag of A. If B is in the use dag of A, then B // depends on the output of A. void BBVectorize::buildDepMap( BasicBlock &BB, - std::multimap<Value *, Value *> &CandidatePairs, + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, std::vector<Value *> &PairableInsts, DenseSet<ValuePair> &PairableInstUsers) { DenseSet<Value *> IsInPair; - for (std::multimap<Value *, Value *>::iterator C = CandidatePairs.begin(), - E = CandidatePairs.end(); C != E; ++C) { + for (DenseMap<Value *, std::vector<Value *> >::iterator C = + CandidatePairs.begin(), E = CandidatePairs.end(); C != E; ++C) { IsInPair.insert(C->first); - IsInPair.insert(C->second); + IsInPair.insert(C->second.begin(), C->second.end()); } - // Iterate through the basic block, recording all Users of each + // Iterate through the basic block, recording all users of each // pairable instruction. - BasicBlock::iterator E = BB.end(); + BasicBlock::iterator E = BB.end(), EL = + BasicBlock::iterator(cast<Instruction>(PairableInsts.back())); for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) { if (IsInPair.find(I) == IsInPair.end()) continue; DenseSet<Value *> Users; AliasSetTracker WriteSet(*AA); - for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) + for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) { (void) trackUsesOfI(Users, WriteSet, I, J); + if (J == EL) + break; + } + for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end(); - U != E; ++U) + U != E; ++U) { + if (IsInPair.find(*U) == IsInPair.end()) continue; PairableInstUsers.insert(ValuePair(I, *U)); + } + + if (I == EL) + break; } } @@ -1389,8 +1425,9 @@ namespace { // input of pair Q is an output of pair P. If this is the case, then these // two pairs cannot be simultaneously fused. 
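  // Concretely (hypothetical IR): with P = <a0, a1> and Q = <b0, b1>, if
  // b0 = add a0, 1 and a1 = mul b1, 2 then Q uses P and P uses Q; fusing both
  // would require each fused instruction to precede the other, so the pairs
  // conflict.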
bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q, - DenseSet<ValuePair> &PairableInstUsers, - std::multimap<ValuePair, ValuePair> *PairableInstUserMap) { + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<ValuePair, std::vector<ValuePair> > *PairableInstUserMap, + DenseSet<VPPair> *PairableInstUserPairSet) { // Two pairs are in conflict if they are mutual users of each other. bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) || PairableInstUsers.count(ValuePair(P.first, Q.second)) || @@ -1403,17 +1440,14 @@ if (PairableInstUserMap) { // FIXME: The expensive part of the cycle check is not so much the cycle // check itself but this edge insertion procedure. This needs some - // profiling and probably a different data structure (same is true of - // most uses of std::multimap). + // profiling and probably a different data structure. if (PUsesQ) { - VPPIteratorPair QPairRange = PairableInstUserMap->equal_range(Q); - if (!isSecondInIteratorPair(P, QPairRange)) - PairableInstUserMap->insert(VPPair(Q, P)); + if (PairableInstUserPairSet->insert(VPPair(Q, P)).second) + (*PairableInstUserMap)[Q].push_back(P); } if (QUsesP) { - VPPIteratorPair PPairRange = PairableInstUserMap->equal_range(P); - if (!isSecondInIteratorPair(Q, PPairRange)) - PairableInstUserMap->insert(VPPair(P, Q)); + if (PairableInstUserPairSet->insert(VPPair(P, Q)).second) + (*PairableInstUserMap)[P].push_back(Q); } } @@ -1423,8 +1457,8 @@ // This function walks the use graph of current pairs to see if, starting // from P, the walk returns to P. bool BBVectorize::pairWillFormCycle(ValuePair P, - std::multimap<ValuePair, ValuePair> &PairableInstUserMap, - DenseSet<ValuePair> &CurrentPairs) { + DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, + DenseSet<ValuePair> &CurrentPairs) { DEBUG(if (DebugCycleCheck) dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> " << *P.second << "\n"); @@ -1441,36 +1475,41 @@ DEBUG(if (DebugCycleCheck) dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> " << *QTop.second << "\n"); - VPPIteratorPair QPairRange = PairableInstUserMap.equal_range(QTop); - for (std::multimap<ValuePair, ValuePair>::iterator C = QPairRange.first; - C != QPairRange.second; ++C) { - if (C->second == P) { + DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = + PairableInstUserMap.find(QTop); + if (QQ == PairableInstUserMap.end()) + continue; + + for (std::vector<ValuePair>::iterator C = QQ->second.begin(), + CE = QQ->second.end(); C != CE; ++C) { + if (*C == P) { DEBUG(dbgs() << "BBV: rejected to prevent non-trivial cycle formation: " - << *C->first.first << " <-> " << *C->first.second << "\n"); + << *QTop.first << " <-> " << *QTop.second << "\n"); return true; } - if (CurrentPairs.count(C->second) && !Visited.count(C->second)) - Q.push_back(C->second); + if (CurrentPairs.count(*C) && !Visited.count(*C)) + Q.push_back(*C); } } while (!Q.empty()); return false; } - // This function builds the initial tree of connected pairs with the + // This function builds the initial dag of connected pairs with the // pair J at the root. - void BBVectorize::buildInitialTreeFor( - std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<ValuePair, size_t> &Tree, ValuePair J) { - // Each of these pairs is viewed as the root node of a Tree. 
The Tree + void BBVectorize::buildInitialDAGFor( + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + DenseSet<ValuePair> &CandidatePairsSet, + std::vector<Value *> &PairableInsts, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<Value *, Value *> &ChosenPairs, + DenseMap<ValuePair, size_t> &DAG, ValuePair J) { + // Each of these pairs is viewed as the root node of a DAG. The DAG // is then walked (depth-first). As this happens, we keep track of - // the pairs that compose the Tree and the maximum depth of the Tree. + // the pairs that compose the DAG and the maximum depth of the DAG. SmallVector<ValuePairWithDepth, 32> Q; // General depth-first post-order traversal: Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); @@ -1480,69 +1519,65 @@ namespace { // Push each child onto the queue: bool MoreChildren = false; size_t MaxChildDepth = QTop.second; - VPPIteratorPair qtRange = ConnectedPairs.equal_range(QTop.first); - for (std::multimap<ValuePair, ValuePair>::iterator k = qtRange.first; - k != qtRange.second; ++k) { - // Make sure that this child pair is still a candidate: - bool IsStillCand = false; - VPIteratorPair checkRange = - CandidatePairs.equal_range(k->second.first); - for (std::multimap<Value *, Value *>::iterator m = checkRange.first; - m != checkRange.second; ++m) { - if (m->second == k->second.second) { - IsStillCand = true; - break; - } - } - - if (IsStillCand) { - DenseMap<ValuePair, size_t>::iterator C = Tree.find(k->second); - if (C == Tree.end()) { - size_t d = getDepthFactor(k->second.first); - Q.push_back(ValuePairWithDepth(k->second, QTop.second+d)); - MoreChildren = true; - } else { - MaxChildDepth = std::max(MaxChildDepth, C->second); + DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = + ConnectedPairs.find(QTop.first); + if (QQ != ConnectedPairs.end()) + for (std::vector<ValuePair>::iterator k = QQ->second.begin(), + ke = QQ->second.end(); k != ke; ++k) { + // Make sure that this child pair is still a candidate: + if (CandidatePairsSet.count(*k)) { + DenseMap<ValuePair, size_t>::iterator C = DAG.find(*k); + if (C == DAG.end()) { + size_t d = getDepthFactor(k->first); + Q.push_back(ValuePairWithDepth(*k, QTop.second+d)); + MoreChildren = true; + } else { + MaxChildDepth = std::max(MaxChildDepth, C->second); + } } } - } if (!MoreChildren) { - // Record the current pair as part of the Tree: - Tree.insert(ValuePairWithDepth(QTop.first, MaxChildDepth)); + // Record the current pair as part of the DAG: + DAG.insert(ValuePairWithDepth(QTop.first, MaxChildDepth)); Q.pop_back(); } } while (!Q.empty()); } - // Given some initial tree, prune it by removing conflicting pairs (pairs + // Given some initial dag, prune it by removing conflicting pairs (pairs // that cannot be simultaneously chosen for vectorization). 
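  // When two children of the same node conflict, the one promising the greater
  // depth is kept (ties favor the child seen first), so the pruning is greedy
  // per node rather than globally optimal.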
- void BBVectorize::pruneTreeFor( - std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - DenseSet<ValuePair> &PairableInstUsers, - std::multimap<ValuePair, ValuePair> &PairableInstUserMap, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<ValuePair, size_t> &Tree, - DenseSet<ValuePair> &PrunedTree, ValuePair J, - bool UseCycleCheck) { + void BBVectorize::pruneDAGFor( + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + std::vector<Value *> &PairableInsts, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, + DenseSet<VPPair> &PairableInstUserPairSet, + DenseMap<Value *, Value *> &ChosenPairs, + DenseMap<ValuePair, size_t> &DAG, + DenseSet<ValuePair> &PrunedDAG, ValuePair J, + bool UseCycleCheck) { SmallVector<ValuePairWithDepth, 32> Q; // General depth-first post-order traversal: Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); do { ValuePairWithDepth QTop = Q.pop_back_val(); - PrunedTree.insert(QTop.first); + PrunedDAG.insert(QTop.first); // Visit each child, pruning as necessary... SmallVector<ValuePairWithDepth, 8> BestChildren; - VPPIteratorPair QTopRange = ConnectedPairs.equal_range(QTop.first); - for (std::multimap<ValuePair, ValuePair>::iterator K = QTopRange.first; - K != QTopRange.second; ++K) { - DenseMap<ValuePair, size_t>::iterator C = Tree.find(K->second); - if (C == Tree.end()) continue; + DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = + ConnectedPairs.find(QTop.first); + if (QQ == ConnectedPairs.end()) + continue; - // This child is in the Tree, now we need to make sure it is the + for (std::vector<ValuePair>::iterator K = QQ->second.begin(), + KE = QQ->second.end(); K != KE; ++K) { + DenseMap<ValuePair, size_t>::iterator C = DAG.find(*K); + if (C == DAG.end()) continue; + + // This child is in the DAG, now we need to make sure it is the // best of any conflicting children. There could be multiple // conflicting children, so first, determine if we're keeping // this child, then delete conflicting children as necessary. @@ -1556,7 +1591,7 @@ namespace { // fusing (a,b) we have y .. a/b .. x where y is an input // to a/b and x is an output to a/b: x and y can no longer // be legally fused. To prevent this condition, we must - // make sure that a child pair added to the Tree is not + // make sure that a child pair added to the DAG is not // both an input and output of an already-selected pair. // Pairing-induced dependencies can also form from more complicated @@ -1575,7 +1610,8 @@ namespace { C2->first.second == C->first.first || C2->first.second == C->first.second || pairsConflict(C2->first, C->first, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : 0)) { + UseCycleCheck ? &PairableInstUserMap : 0, + UseCycleCheck ? &PairableInstUserPairSet : 0)) { if (C2->second >= C->second) { CanAdd = false; break; @@ -1587,15 +1623,16 @@ namespace { if (!CanAdd) continue; // Even worse, this child could conflict with another node already - // selected for the Tree. If that is the case, ignore this child. - for (DenseSet<ValuePair>::iterator T = PrunedTree.begin(), - E2 = PrunedTree.end(); T != E2; ++T) { + // selected for the DAG. If that is the case, ignore this child. 
+ for (DenseSet<ValuePair>::iterator T = PrunedDAG.begin(), + E2 = PrunedDAG.end(); T != E2; ++T) { if (T->first == C->first.first || T->first == C->first.second || T->second == C->first.first || T->second == C->first.second || pairsConflict(*T, C->first, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : 0)) { + UseCycleCheck ? &PairableInstUserMap : 0, + UseCycleCheck ? &PairableInstUserPairSet : 0)) { CanAdd = false; break; } @@ -1612,7 +1649,8 @@ namespace { C2->first.second == C->first.first || C2->first.second == C->first.second || pairsConflict(C2->first, C->first, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : 0)) { + UseCycleCheck ? &PairableInstUserMap : 0, + UseCycleCheck ? &PairableInstUserPairSet : 0)) { CanAdd = false; break; } @@ -1627,7 +1665,8 @@ namespace { ChosenPairs.begin(), E2 = ChosenPairs.end(); C2 != E2; ++C2) { if (pairsConflict(*C2, C->first, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : 0)) { + UseCycleCheck ? &PairableInstUserMap : 0, + UseCycleCheck ? &PairableInstUserPairSet : 0)) { CanAdd = false; break; } @@ -1639,7 +1678,7 @@ namespace { // To check for non-trivial cycles formed by the addition of the // current pair we've formed a list of all relevant pairs, now use a // graph walk to check for a cycle. We start from the current pair and - // walk the use tree to see if we again reach the current pair. If we + // walk the use dag to see if we again reach the current pair. If we // do, then the current pair is rejected. // FIXME: It may be more efficient to use a topological-ordering @@ -1676,34 +1715,40 @@ namespace { } while (!Q.empty()); } - // This function finds the best tree of mututally-compatible connected + // This function finds the best dag of mututally-compatible connected // pairs, given the choice of root pairs as an iterator range. 
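The cycle check described in the comments above reduces to a plain worklist walk: start from the pair under consideration, follow user edges, and reject the pair if the walk returns to its starting point. A minimal standalone sketch, with hypothetical integer node ids and std containers in place of the pass's DenseSet/DenseMap:

    #include <map>
    #include <set>
    #include <vector>

    // Returns true if, starting from Start and walking user edges, we can
    // reach Start again, i.e. adding Start would close a cycle.
    bool wouldFormCycle(int Start,
                        const std::map<int, std::vector<int>> &Users) {
      std::vector<int> Worklist{Start};
      std::set<int> Visited;
      while (!Worklist.empty()) {
        int N = Worklist.back();
        Worklist.pop_back();
        auto UI = Users.find(N);
        if (UI == Users.end())
          continue;
        for (int U : UI->second) {
          if (U == Start)
            return true;           // Walked back to where we began: cycle.
          if (Visited.insert(U).second)
            Worklist.push_back(U); // First time we see U: keep walking.
        }
      }
      return false;
    }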
- void BBVectorize::findBestTreeFor( - std::multimap<Value *, Value *> &CandidatePairs, - DenseMap<ValuePair, int> &CandidatePairCostSavings, - std::vector<Value *> &PairableInsts, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, - DenseSet<ValuePair> &PairableInstUsers, - std::multimap<ValuePair, ValuePair> &PairableInstUserMap, - DenseMap<Value *, Value *> &ChosenPairs, - DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, - int &BestEffSize, VPIteratorPair ChoiceRange, - bool UseCycleCheck) { - for (std::multimap<Value *, Value *>::iterator J = ChoiceRange.first; - J != ChoiceRange.second; ++J) { + void BBVectorize::findBestDAGFor( + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + DenseSet<ValuePair> &CandidatePairsSet, + DenseMap<ValuePair, int> &CandidatePairCostSavings, + std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, + DenseSet<VPPair> &PairableInstUserPairSet, + DenseMap<Value *, Value *> &ChosenPairs, + DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth, + int &BestEffSize, Value *II, std::vector<Value *>&JJ, + bool UseCycleCheck) { + for (std::vector<Value *>::iterator J = JJ.begin(), JE = JJ.end(); + J != JE; ++J) { + ValuePair IJ(II, *J); + if (!CandidatePairsSet.count(IJ)) + continue; // Before going any further, make sure that this pair does not // conflict with any already-selected pairs (see comment below - // near the Tree pruning for more details). + // near the DAG pruning for more details). DenseSet<ValuePair> ChosenPairSet; bool DoesConflict = false; for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(), E = ChosenPairs.end(); C != E; ++C) { - if (pairsConflict(*C, *J, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : 0)) { + if (pairsConflict(*C, IJ, PairableInstUsers, + UseCycleCheck ? &PairableInstUserMap : 0, + UseCycleCheck ? &PairableInstUserPairSet : 0)) { DoesConflict = true; break; } @@ -1713,40 +1758,42 @@ namespace { if (DoesConflict) continue; if (UseCycleCheck && - pairWillFormCycle(*J, PairableInstUserMap, ChosenPairSet)) + pairWillFormCycle(IJ, PairableInstUserMap, ChosenPairSet)) continue; - DenseMap<ValuePair, size_t> Tree; - buildInitialTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, - PairableInstUsers, ChosenPairs, Tree, *J); + DenseMap<ValuePair, size_t> DAG; + buildInitialDAGFor(CandidatePairs, CandidatePairsSet, + PairableInsts, ConnectedPairs, + PairableInstUsers, ChosenPairs, DAG, IJ); // Because we'll keep the child with the largest depth, the largest - // depth is still the same in the unpruned Tree. - size_t MaxDepth = Tree.lookup(*J); + // depth is still the same in the unpruned DAG. 
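During pruning, conflicting children of a node are resolved by depth: a child survives only if it is strictly deeper than every already-kept child it conflicts with, and ties favor the child seen first. A condensed standalone sketch of that rule (the pass applies the same comparison against the queue, the pruned DAG, and the already-chosen pairs; conflict() is a hypothetical stand-in for pairsConflict):

    #include <functional>
    #include <utility>
    #include <vector>

    using Child = std::pair<int, int>; // (node id, accumulated depth)

    void considerChild(std::vector<Child> &BestChildren, const Child &C,
                       const std::function<bool(int, int)> &conflict) {
      // Pass 1: can C be added at all? A conflicting child that is at least
      // as deep (ties favor the earlier child) blocks it.
      for (const Child &C2 : BestChildren)
        if (conflict(C2.first, C.first) && C2.second >= C.second)
          return;
      // Pass 2: C wins; drop every (strictly shallower) conflicting child.
      for (auto It = BestChildren.begin(); It != BestChildren.end();)
        It = conflict(It->first, C.first) ? BestChildren.erase(It) : It + 1;
      BestChildren.push_back(C);
    }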
+ size_t MaxDepth = DAG.lookup(IJ); - DEBUG(if (DebugPairSelection) dbgs() << "BBV: found Tree for pair {" - << *J->first << " <-> " << *J->second << "} of depth " << - MaxDepth << " and size " << Tree.size() << "\n"); + DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {" + << IJ.first << " <-> " << IJ.second << "} of depth " << + MaxDepth << " and size " << DAG.size() << "\n"); - // At this point the Tree has been constructed, but, may contain + // At this point the DAG has been constructed, but, may contain // contradictory children (meaning that different children of - // some tree node may be attempting to fuse the same instruction). - // So now we walk the tree again, in the case of a conflict, + // some dag node may be attempting to fuse the same instruction). + // So now we walk the dag again, in the case of a conflict, // keep only the child with the largest depth. To break a tie, // favor the first child. - DenseSet<ValuePair> PrunedTree; - pruneTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, - PairableInstUsers, PairableInstUserMap, ChosenPairs, Tree, - PrunedTree, *J, UseCycleCheck); + DenseSet<ValuePair> PrunedDAG; + pruneDAGFor(CandidatePairs, PairableInsts, ConnectedPairs, + PairableInstUsers, PairableInstUserMap, + PairableInstUserPairSet, + ChosenPairs, DAG, PrunedDAG, IJ, UseCycleCheck); int EffSize = 0; if (TTI) { - DenseSet<Value *> PrunedTreeInstrs; - for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), - E = PrunedTree.end(); S != E; ++S) { - PrunedTreeInstrs.insert(S->first); - PrunedTreeInstrs.insert(S->second); + DenseSet<Value *> PrunedDAGInstrs; + for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), + E = PrunedDAG.end(); S != E; ++S) { + PrunedDAGInstrs.insert(S->first); + PrunedDAGInstrs.insert(S->second); } // The set of pairs that have already contributed to the total cost. @@ -1759,8 +1806,8 @@ namespace { // The node weights represent the cost savings associated with // fusing the pair of instructions. - for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), - E = PrunedTree.end(); S != E; ++S) { + for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), + E = PrunedDAG.end(); S != E; ++S) { if (!isa<ShuffleVectorInst>(S->first) && !isa<InsertElementInst>(S->first) && !isa<ExtractElementInst>(S->first)) @@ -1778,15 +1825,17 @@ namespace { // The edge weights contribute in a negative sense: they represent // the cost of shuffles. 
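The bookkeeping that follows reduces to a signed sum: each fused pair contributes its depth factor as a saving, while each cross-pair operand reordering charges a shuffle, which this patch additionally clamps for two-element vectors by the targeted SK_Broadcast/SK_Reverse costs. A minimal sketch of the accounting, with hypothetical integer costs standing in for the TargetTransformInfo queries:

    #include <algorithm>
    #include <map>
    #include <utility>
    #include <vector>

    using Pair = std::pair<int, int>;

    // Net benefit of a pruned DAG: node weights are savings, edges are
    // shuffle penalties. A DAG is only worth vectorizing if this is > 0.
    int effectiveSize(const std::vector<Pair> &PrunedDAG,
                      const std::map<Pair, int> &DepthFactor,
                      unsigned NumShuffles, int GenericShuffleCost,
                      int TargetedShuffleCost /* e.g. reverse on <2 x T> */) {
      int PerShuffle = std::min(GenericShuffleCost, TargetedShuffleCost);
      int EffSize = 0;
      for (const Pair &P : PrunedDAG)
        EffSize += DepthFactor.at(P);             // fusion savings
      EffSize -= PerShuffle * (int)NumShuffles;   // shuffle penalties
      return EffSize;
    }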
- VPPIteratorPair IP = ConnectedPairDeps.equal_range(*S); - if (IP.first != ConnectedPairDeps.end()) { + DenseMap<ValuePair, std::vector<ValuePair> >::iterator SS = + ConnectedPairDeps.find(*S); + if (SS != ConnectedPairDeps.end()) { unsigned NumDepsDirect = 0, NumDepsSwap = 0; - for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; - Q != IP.second; ++Q) { - if (!PrunedTree.count(Q->second)) + for (std::vector<ValuePair>::iterator T = SS->second.begin(), + TE = SS->second.end(); T != TE; ++T) { + VPPair Q(*S, *T); + if (!PrunedDAG.count(Q.second)) continue; DenseMap<VPPair, unsigned>::iterator R = - PairConnectionTypes.find(VPPair(Q->second, Q->first)); + PairConnectionTypes.find(VPPair(Q.second, Q.first)); assert(R != PairConnectionTypes.end() && "Cannot find pair connection type"); if (R->second == PairConnectionDirect) @@ -1802,24 +1851,35 @@ namespace { ((NumDepsSwap > NumDepsDirect) || FixedOrderPairs.count(ValuePair(S->second, S->first))); - for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; - Q != IP.second; ++Q) { - if (!PrunedTree.count(Q->second)) + for (std::vector<ValuePair>::iterator T = SS->second.begin(), + TE = SS->second.end(); T != TE; ++T) { + VPPair Q(*S, *T); + if (!PrunedDAG.count(Q.second)) continue; DenseMap<VPPair, unsigned>::iterator R = - PairConnectionTypes.find(VPPair(Q->second, Q->first)); + PairConnectionTypes.find(VPPair(Q.second, Q.first)); assert(R != PairConnectionTypes.end() && "Cannot find pair connection type"); - Type *Ty1 = Q->second.first->getType(), - *Ty2 = Q->second.second->getType(); + Type *Ty1 = Q.second.first->getType(), + *Ty2 = Q.second.second->getType(); Type *VTy = getVecTypeForPair(Ty1, Ty2); if ((R->second == PairConnectionDirect && FlipOrder) || (R->second == PairConnectionSwap && !FlipOrder) || R->second == PairConnectionSplat) { int ESContrib = (int) getInstrCost(Instruction::ShuffleVector, VTy, VTy); + + if (VTy->getVectorNumElements() == 2) { + if (R->second == PairConnectionSplat) + ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( + TargetTransformInfo::SK_Broadcast, VTy)); + else + ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( + TargetTransformInfo::SK_Reverse, VTy)); + } + DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << - *Q->second.first << " <-> " << *Q->second.second << + *Q.second.first << " <-> " << *Q.second.second << "} -> {" << *S->first << " <-> " << *S->second << "} = " << ESContrib << "\n"); @@ -1846,7 +1906,7 @@ namespace { } if (isa<ExtractElementInst>(*I)) continue; - if (PrunedTreeInstrs.count(*I)) + if (PrunedDAGInstrs.count(*I)) continue; NeedsExtraction = true; break; @@ -1854,10 +1914,12 @@ namespace { if (NeedsExtraction) { int ESContrib; - if (Ty1->isVectorTy()) + if (Ty1->isVectorTy()) { ESContrib = (int) getInstrCost(Instruction::ShuffleVector, Ty1, VTy); - else + ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( + TargetTransformInfo::SK_ExtractSubvector, VTy, 0, Ty1)); + } else ESContrib = (int) TTI->getVectorInstrCost( Instruction::ExtractElement, VTy, 0); @@ -1876,7 +1938,7 @@ namespace { } if (isa<ExtractElementInst>(*I)) continue; - if (PrunedTreeInstrs.count(*I)) + if (PrunedDAGInstrs.count(*I)) continue; NeedsExtraction = true; break; @@ -1884,10 +1946,13 @@ namespace { if (NeedsExtraction) { int ESContrib; - if (Ty2->isVectorTy()) + if (Ty2->isVectorTy()) { ESContrib = (int) getInstrCost(Instruction::ShuffleVector, Ty2, VTy); - else + ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( + 
TargetTransformInfo::SK_ExtractSubvector, VTy, + Ty1->isVectorTy() ? Ty1->getVectorNumElements() : 1, Ty2)); + } else ESContrib = (int) TTI->getVectorInstrCost( Instruction::ExtractElement, VTy, 1); DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << @@ -1915,7 +1980,7 @@ namespace { ValuePair VPR = ValuePair(O2, O1); // Internal edges are not handled here. - if (PrunedTree.count(VP) || PrunedTree.count(VPR)) + if (PrunedDAG.count(VP) || PrunedDAG.count(VPR)) continue; Type *Ty1 = O1->getType(), @@ -1963,6 +2028,10 @@ namespace { } else if (IncomingPairs.count(VPR)) { ESContrib = (int) getInstrCost(Instruction::ShuffleVector, VTy, VTy); + + if (VTy->getVectorNumElements() == 2) + ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( + TargetTransformInfo::SK_Reverse, VTy)); } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) { ESContrib = (int) TTI->getVectorInstrCost( Instruction::InsertElement, VTy, 0); @@ -2005,27 +2074,27 @@ namespace { if (!HasNontrivialInsts) { DEBUG(if (DebugPairSelection) dbgs() << - "\tNo non-trivial instructions in tree;" + "\tNo non-trivial instructions in DAG;" " override to zero effective size\n"); EffSize = 0; } } else { - for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), - E = PrunedTree.end(); S != E; ++S) + for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), + E = PrunedDAG.end(); S != E; ++S) EffSize += (int) getDepthFactor(S->first); } DEBUG(if (DebugPairSelection) - dbgs() << "BBV: found pruned Tree for pair {" - << *J->first << " <-> " << *J->second << "} of depth " << - MaxDepth << " and size " << PrunedTree.size() << + dbgs() << "BBV: found pruned DAG for pair {" + << IJ.first << " <-> " << IJ.second << "} of depth " << + MaxDepth << " and size " << PrunedDAG.size() << " (effective size: " << EffSize << ")\n"); if (((TTI && !UseChainDepthWithTI) || MaxDepth >= Config.ReqChainDepth) && EffSize > 0 && EffSize > BestEffSize) { BestMaxDepth = MaxDepth; BestEffSize = EffSize; - BestTree = PrunedTree; + BestDAG = PrunedDAG; } } } @@ -2033,66 +2102,98 @@ namespace { // Given the list of candidate pairs, this function selects those // that will be fused into vector instructions. 
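Selection is greedy: for each still-pairable instruction, the best DAG rooted at it is computed, its pairs are committed, and every remaining candidate that reuses one of the committed values is discarded (the chosen pair itself survives so it can participate in later roots). A standalone sketch of that loop; findBestDAGFor is a hypothetical callback here, and the linear erase scan stands in for the per-value candidate vectors that the patch introduces precisely to avoid such scans:

    #include <functional>
    #include <set>
    #include <utility>
    #include <vector>

    using Pair = std::pair<int, int>;

    void choosePairsSketch(
        std::set<Pair> &Candidates, const std::vector<int> &PairableInsts,
        std::set<Pair> &ChosenPairs,
        const std::function<std::set<Pair>(int, const std::set<Pair> &)>
            &findBestDAGFor) {
      for (int I : PairableInsts) {
        std::set<Pair> BestDAG = findBestDAGFor(I, Candidates);
        for (const Pair &S : BestDAG) {
          ChosenPairs.insert(S);
          // Discard all other candidates touching either member of S.
          for (auto It = Candidates.begin(); It != Candidates.end();) {
            bool Touches = It->first == S.first || It->first == S.second ||
                           It->second == S.first || It->second == S.second;
            if (Touches && *It != S)
              It = Candidates.erase(It);
            else
              ++It;
          }
        }
      }
    }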
void BBVectorize::choosePairs( - std::multimap<Value *, Value *> &CandidatePairs, - DenseMap<ValuePair, int> &CandidatePairCostSavings, - std::vector<Value *> &PairableInsts, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<Value *, Value *>& ChosenPairs) { + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + DenseSet<ValuePair> &CandidatePairsSet, + DenseMap<ValuePair, int> &CandidatePairCostSavings, + std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<Value *, Value *>& ChosenPairs) { bool UseCycleCheck = - CandidatePairs.size() <= Config.MaxCandPairsForCycleCheck; - std::multimap<ValuePair, ValuePair> PairableInstUserMap; + CandidatePairsSet.size() <= Config.MaxCandPairsForCycleCheck; + + DenseMap<Value *, std::vector<Value *> > CandidatePairs2; + for (DenseSet<ValuePair>::iterator I = CandidatePairsSet.begin(), + E = CandidatePairsSet.end(); I != E; ++I) { + std::vector<Value *> &JJ = CandidatePairs2[I->second]; + if (JJ.empty()) JJ.reserve(32); + JJ.push_back(I->first); + } + + DenseMap<ValuePair, std::vector<ValuePair> > PairableInstUserMap; + DenseSet<VPPair> PairableInstUserPairSet; for (std::vector<Value *>::iterator I = PairableInsts.begin(), E = PairableInsts.end(); I != E; ++I) { // The number of possible pairings for this variable: - size_t NumChoices = CandidatePairs.count(*I); + size_t NumChoices = CandidatePairs.lookup(*I).size(); if (!NumChoices) continue; - VPIteratorPair ChoiceRange = CandidatePairs.equal_range(*I); + std::vector<Value *> &JJ = CandidatePairs[*I]; - // The best pair to choose and its tree: + // The best pair to choose and its dag: size_t BestMaxDepth = 0; int BestEffSize = 0; - DenseSet<ValuePair> BestTree; - findBestTreeFor(CandidatePairs, CandidatePairCostSavings, + DenseSet<ValuePair> BestDAG; + findBestDAGFor(CandidatePairs, CandidatePairsSet, + CandidatePairCostSavings, PairableInsts, FixedOrderPairs, PairConnectionTypes, ConnectedPairs, ConnectedPairDeps, - PairableInstUsers, PairableInstUserMap, ChosenPairs, - BestTree, BestMaxDepth, BestEffSize, ChoiceRange, + PairableInstUsers, PairableInstUserMap, + PairableInstUserPairSet, ChosenPairs, + BestDAG, BestMaxDepth, BestEffSize, *I, JJ, UseCycleCheck); - // A tree has been chosen (or not) at this point. If no tree was + if (BestDAG.empty()) + continue; + + // A dag has been chosen (or not) at this point. If no dag was // chosen, then this instruction, I, cannot be paired (and is no longer // considered). - DEBUG(if (BestTree.size() > 0) - dbgs() << "BBV: selected pairs in the best tree for: " - << *cast<Instruction>(*I) << "\n"); + DEBUG(dbgs() << "BBV: selected pairs in the best DAG for: " + << *cast<Instruction>(*I) << "\n"); - for (DenseSet<ValuePair>::iterator S = BestTree.begin(), - SE2 = BestTree.end(); S != SE2; ++S) { - // Insert the members of this tree into the list of chosen pairs. + for (DenseSet<ValuePair>::iterator S = BestDAG.begin(), + SE2 = BestDAG.end(); S != SE2; ++S) { + // Insert the members of this dag into the list of chosen pairs. 
ChosenPairs.insert(ValuePair(S->first, S->second)); DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " << *S->second << "\n"); - // Remove all candidate pairs that have values in the chosen tree. - for (std::multimap<Value *, Value *>::iterator K = - CandidatePairs.begin(); K != CandidatePairs.end();) { - if (K->first == S->first || K->second == S->first || - K->second == S->second || K->first == S->second) { - // Don't remove the actual pair chosen so that it can be used - // in subsequent tree selections. - if (!(K->first == S->first && K->second == S->second)) - CandidatePairs.erase(K++); - else - ++K; - } else { - ++K; - } + // Remove all candidate pairs that have values in the chosen dag. + std::vector<Value *> &KK = CandidatePairs[S->first]; + for (std::vector<Value *>::iterator K = KK.begin(), KE = KK.end(); + K != KE; ++K) { + if (*K == S->second) + continue; + + CandidatePairsSet.erase(ValuePair(S->first, *K)); + } + + std::vector<Value *> &LL = CandidatePairs2[S->second]; + for (std::vector<Value *>::iterator L = LL.begin(), LE = LL.end(); + L != LE; ++L) { + if (*L == S->first) + continue; + + CandidatePairsSet.erase(ValuePair(*L, S->second)); + } + + std::vector<Value *> &MM = CandidatePairs[S->second]; + for (std::vector<Value *>::iterator M = MM.begin(), ME = MM.end(); + M != ME; ++M) { + assert(*M != S->first && "Flipped pair in candidate list?"); + CandidatePairsSet.erase(ValuePair(S->second, *M)); + } + + std::vector<Value *> &NN = CandidatePairs2[S->first]; + for (std::vector<Value *>::iterator N = NN.begin(), NE = NN.end(); + N != NE; ++N) { + assert(*N != S->second && "Flipped pair in candidate list?"); + CandidatePairsSet.erase(ValuePair(*N, S->first)); } } } @@ -2696,7 +2797,7 @@ namespace { // Move all uses of the function I (including pairing-induced uses) after J. bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB, - std::multimap<Value *, Value *> &LoadMoveSet, + DenseSet<ValuePair> &LoadMoveSetPairs, Instruction *I, Instruction *J) { // Skip to the first instruction past I. BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I)); @@ -2704,18 +2805,18 @@ namespace { DenseSet<Value *> Users; AliasSetTracker WriteSet(*AA); for (; cast<Instruction>(L) != J; ++L) - (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet); + (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs); assert(cast<Instruction>(L) == J && "Tracking has not proceeded far enough to check for dependencies"); // If J is now in the use set of I, then trackUsesOfI will return true // and we have a dependency cycle (and the fusing operation must abort). - return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSet); + return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSetPairs); } // Move all uses of the function I (including pairing-induced uses) after J. void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB, - std::multimap<Value *, Value *> &LoadMoveSet, + DenseSet<ValuePair> &LoadMoveSetPairs, Instruction *&InsertionPt, Instruction *I, Instruction *J) { // Skip to the first instruction past I. @@ -2724,7 +2825,7 @@ namespace { DenseSet<Value *> Users; AliasSetTracker WriteSet(*AA); for (; cast<Instruction>(L) != J;) { - if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet)) { + if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs)) { // Move this instruction Instruction *InstToMove = L; ++L; @@ -2744,7 +2845,8 @@ namespace { // to be moved after J (the second instruction) when the pair is fused. 
void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB, DenseMap<Value *, Value *> &ChosenPairs, - std::multimap<Value *, Value *> &LoadMoveSet, + DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, + DenseSet<ValuePair> &LoadMoveSetPairs, Instruction *I) { // Skip to the first instruction past I. BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I)); @@ -2757,8 +2859,10 @@ namespace { // could be before I if this is an inverted input. for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) { if (trackUsesOfI(Users, WriteSet, I, L)) { - if (L->mayReadFromMemory()) - LoadMoveSet.insert(ValuePair(L, I)); + if (L->mayReadFromMemory()) { + LoadMoveSet[L].push_back(I); + LoadMoveSetPairs.insert(ValuePair(L, I)); + } } } } @@ -2767,20 +2871,22 @@ namespace { // are chosen for vectorization, we can end up in a situation where the // aliasing analysis starts returning different query results as the // process of fusing instruction pairs continues. Because the algorithm - // relies on finding the same use trees here as were found earlier, we'll + // relies on finding the same use dags here as were found earlier, we'll // need to precompute the necessary aliasing information here and then // manually update it during the fusion process. void BBVectorize::collectLoadMoveSet(BasicBlock &BB, std::vector<Value *> &PairableInsts, DenseMap<Value *, Value *> &ChosenPairs, - std::multimap<Value *, Value *> &LoadMoveSet) { + DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, + DenseSet<ValuePair> &LoadMoveSetPairs) { for (std::vector<Value *>::iterator PI = PairableInsts.begin(), PIE = PairableInsts.end(); PI != PIE; ++PI) { DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI); if (P == ChosenPairs.end()) continue; Instruction *I = cast<Instruction>(P->first); - collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, I); + collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, + LoadMoveSetPairs, I); } } @@ -2816,12 +2922,12 @@ namespace { // because the vector instruction is inserted in the location of the pair's // second member). 
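A pattern worth noting throughout this patch: each former multimap becomes a vector-valued map for cheap iteration plus a companion set of flattened pairs for constant-time membership tests (CandidatePairs/CandidatePairsSet, PairableInstUserMap/PairableInstUserPairSet, LoadMoveSet/LoadMoveSetPairs). A minimal sketch of the shape, using std containers and hypothetical integer value ids:

    #include <map>
    #include <set>
    #include <utility>
    #include <vector>

    using ValueId = int;
    using ValuePair = std::pair<ValueId, ValueId>;

    // Two views of one relation: a per-key vector for iteration and a flat
    // pair set for constant-time membership queries.
    struct PairIndex {
      std::map<ValueId, std::vector<ValueId>> ByKey;
      std::set<ValuePair> Pairs;

      void record(ValueId A, ValueId B) {
        ByKey[A].push_back(B);
        Pairs.insert({A, B});
      }
      bool contains(ValueId A, ValueId B) const {
        return Pairs.count({A, B}) != 0;
      }
    };

Both views describe one relation, so they must be updated in lockstep, which is exactly what the fusion code below does for LoadMoveSet and LoadMoveSetPairs.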
void BBVectorize::fuseChosenPairs(BasicBlock &BB, - std::vector<Value *> &PairableInsts, - DenseMap<Value *, Value *> &ChosenPairs, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - std::multimap<ValuePair, ValuePair> &ConnectedPairDeps) { + std::vector<Value *> &PairableInsts, + DenseMap<Value *, Value *> &ChosenPairs, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps) { LLVMContext& Context = BB.getContext(); // During the vectorization process, the order of the pairs to be fused @@ -2835,8 +2941,10 @@ namespace { E = FlippedPairs.end(); P != E; ++P) ChosenPairs.insert(*P); - std::multimap<Value *, Value *> LoadMoveSet; - collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet); + DenseMap<Value *, std::vector<Value *> > LoadMoveSet; + DenseSet<ValuePair> LoadMoveSetPairs; + collectLoadMoveSet(BB, PairableInsts, ChosenPairs, + LoadMoveSet, LoadMoveSetPairs); DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n"); @@ -2868,7 +2976,7 @@ namespace { ChosenPairs.erase(FP); ChosenPairs.erase(P); - if (!canMoveUsesOfIAfterJ(BB, LoadMoveSet, I, J)) { + if (!canMoveUsesOfIAfterJ(BB, LoadMoveSetPairs, I, J)) { DEBUG(dbgs() << "BBV: fusion of: " << *I << " <-> " << *J << " aborted because of non-trivial dependency cycle\n"); @@ -2885,18 +2993,20 @@ namespace { // of dependencies connected via swaps, and those directly connected, // and flip the order if the number of swaps is greater. bool OrigOrder = true; - VPPIteratorPair IP = ConnectedPairDeps.equal_range(ValuePair(I, J)); - if (IP.first == ConnectedPairDeps.end()) { - IP = ConnectedPairDeps.equal_range(ValuePair(J, I)); + DenseMap<ValuePair, std::vector<ValuePair> >::iterator IJ = + ConnectedPairDeps.find(ValuePair(I, J)); + if (IJ == ConnectedPairDeps.end()) { + IJ = ConnectedPairDeps.find(ValuePair(J, I)); OrigOrder = false; } - if (IP.first != ConnectedPairDeps.end()) { + if (IJ != ConnectedPairDeps.end()) { unsigned NumDepsDirect = 0, NumDepsSwap = 0; - for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; - Q != IP.second; ++Q) { + for (std::vector<ValuePair>::iterator T = IJ->second.begin(), + TE = IJ->second.end(); T != TE; ++T) { + VPPair Q(IJ->first, *T); DenseMap<VPPair, unsigned>::iterator R = - PairConnectionTypes.find(VPPair(Q->second, Q->first)); + PairConnectionTypes.find(VPPair(Q.second, Q.first)); assert(R != PairConnectionTypes.end() && "Cannot find pair connection type"); if (R->second == PairConnectionDirect) @@ -2922,17 +3032,20 @@ namespace { // If the pair being fused uses the opposite order from that in the pair // connection map, then we need to flip the types. 
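Flipping a fused pair inverts how its consumers see its lanes, so every recorded connection type has to be toggled; splat connections involve a single repeated value and are direction-independent. In isolation (a sketch mirroring the enum used by the pass):

    enum PairConnectionType {
      PairConnectionDirect, // operands line up with the producer's order
      PairConnectionSwap,   // operands arrive in the opposite order
      PairConnectionSplat   // connection through a single repeated value
    };

    // Emitting a pair in the opposite order turns direct connections into
    // swaps and swaps into direct connections; splats are unaffected.
    PairConnectionType flipConnection(PairConnectionType T) {
      if (T == PairConnectionDirect) return PairConnectionSwap;
      if (T == PairConnectionSwap) return PairConnectionDirect;
      return T;
    }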
- VPPIteratorPair IP = ConnectedPairs.equal_range(ValuePair(H, L)); - for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; - Q != IP.second; ++Q) { - DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(*Q); - assert(R != PairConnectionTypes.end() && - "Cannot find pair connection type"); - if (R->second == PairConnectionDirect) - R->second = PairConnectionSwap; - else if (R->second == PairConnectionSwap) - R->second = PairConnectionDirect; - } + DenseMap<ValuePair, std::vector<ValuePair> >::iterator HL = + ConnectedPairs.find(ValuePair(H, L)); + if (HL != ConnectedPairs.end()) + for (std::vector<ValuePair>::iterator T = HL->second.begin(), + TE = HL->second.end(); T != TE; ++T) { + VPPair Q(HL->first, *T); + DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(Q); + assert(R != PairConnectionTypes.end() && + "Cannot find pair connection type"); + if (R->second == PairConnectionDirect) + R->second = PairConnectionSwap; + else if (R->second == PairConnectionSwap) + R->second = PairConnectionDirect; + } bool LBeforeH = !FlipPairOrder; unsigned NumOperands = I->getNumOperands(); @@ -2964,12 +3077,12 @@ namespace { Instruction *K1 = 0, *K2 = 0; replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2); - // The use tree of the first original instruction must be moved to after - // the location of the second instruction. The entire use tree of the - // first instruction is disjoint from the input tree of the second + // The use dag of the first original instruction must be moved to after + // the location of the second instruction. The entire use dag of the + // first instruction is disjoint from the input dag of the second // (by definition), and so commutes with it. - moveUsesOfIAfterJ(BB, LoadMoveSet, InsertionPt, I, J); + moveUsesOfIAfterJ(BB, LoadMoveSetPairs, InsertionPt, I, J); if (!isa<StoreInst>(I)) { L->replaceAllUsesWith(K1); @@ -2986,17 +3099,23 @@ namespace { // yet-to-be-fused pair. The loads in question are the keys of the map. if (I->mayReadFromMemory()) { std::vector<ValuePair> NewSetMembers; - VPIteratorPair IPairRange = LoadMoveSet.equal_range(I); - VPIteratorPair JPairRange = LoadMoveSet.equal_range(J); - for (std::multimap<Value *, Value *>::iterator N = IPairRange.first; - N != IPairRange.second; ++N) - NewSetMembers.push_back(ValuePair(K, N->second)); - for (std::multimap<Value *, Value *>::iterator N = JPairRange.first; - N != JPairRange.second; ++N) - NewSetMembers.push_back(ValuePair(K, N->second)); + DenseMap<Value *, std::vector<Value *> >::iterator II = + LoadMoveSet.find(I); + if (II != LoadMoveSet.end()) + for (std::vector<Value *>::iterator N = II->second.begin(), + NE = II->second.end(); N != NE; ++N) + NewSetMembers.push_back(ValuePair(K, *N)); + DenseMap<Value *, std::vector<Value *> >::iterator JJ = + LoadMoveSet.find(J); + if (JJ != LoadMoveSet.end()) + for (std::vector<Value *>::iterator N = JJ->second.begin(), + NE = JJ->second.end(); N != NE; ++N) + NewSetMembers.push_back(ValuePair(K, *N)); for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(), - AE = NewSetMembers.end(); A != AE; ++A) - LoadMoveSet.insert(*A); + AE = NewSetMembers.end(); A != AE; ++A) { + LoadMoveSet[A->first].push_back(A->second); + LoadMoveSetPairs.insert(*A); + } } // Before removing I, set the iterator to the next instruction. 
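When I and J fuse into the new load K, the precomputed move-together records of both originals must be re-keyed to K, in the map view shown here as well as in the flat pair-set view. A reduced sketch with hypothetical integer ids:

    #include <map>
    #include <vector>

    using ValueId = int;

    // Re-key the move-together records of the fused loads I and J to the
    // new combined load K.
    void rekeyLoadMoveSet(
        std::map<ValueId, std::vector<ValueId>> &LoadMoveSet,
        ValueId I, ValueId J, ValueId K) {
      std::vector<ValueId> NewSetMembers;
      auto II = LoadMoveSet.find(I);
      if (II != LoadMoveSet.end())
        NewSetMembers.insert(NewSetMembers.end(), II->second.begin(),
                             II->second.end());
      auto JJ = LoadMoveSet.find(J);
      if (JJ != LoadMoveSet.end())
        NewSetMembers.insert(NewSetMembers.end(), JJ->second.begin(),
                             JJ->second.end());
      LoadMoveSet[K] = NewSetMembers; // every old dependent, now keyed by K
    }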
@@ -3056,6 +3175,7 @@ VectorizeConfig::VectorizeConfig() { MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck; SplatBreaksChain = ::SplatBreaksChain; MaxInsts = ::MaxInsts; + MaxPairs = ::MaxPairs; MaxIter = ::MaxIter; Pow2LenOnly = ::Pow2LenOnly; NoMemOpBoost = ::NoMemOpBoost; diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 9c82cb8..f489393 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9,10 +9,10 @@ // // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops // and generates target-independent LLVM-IR. Legalization of the IR is done -// in the codegen. However, the vectorizes uses (will use) the codegen +// in the codegen. However, the vectorizer uses (will use) the codegen // interfaces to generate IR that is likely to result in an optimal binary. // -// The loop vectorizer combines consecutive loop iteration into a single +// The loop vectorizer combines consecutive loop iterations into a single // 'wide' iteration. After this transformation the index is incremented // by the SIMD vector width, and not by one. // @@ -32,7 +32,7 @@ // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. // // Variable uniformity checks are inspired by: -// Karrenberg, R. and Hack, S. Whole Function Vectorization. +// Karrenberg, R. and Hack, S. Whole Function Vectorization. // // Other ideas/concepts are from: // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. @@ -101,24 +101,20 @@ EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization.")); /// We don't vectorize loops with a known constant trip count below this number. -static const unsigned TinyTripCountVectorThreshold = 16; +static cl::opt<unsigned> +TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), + cl::Hidden, + cl::desc("Don't vectorize loops with a constant " + "trip count that is smaller than this " + "value.")); /// We don't unroll loops with a known constant trip count below this number. static const unsigned TinyTripCountUnrollThreshold = 128; -/// We don't unroll loops that are larget than this threshold. -static const unsigned MaxLoopSizeThreshold = 32; - /// When performing a runtime memory check, do not check more than this /// number of pointers. Notice that the check is quadratic! static const unsigned RuntimeMemoryCheckThreshold = 4; -/// This is the highest vector width that we try to generate. -static const unsigned MaxVectorSize = 8; - -/// This is the highest Unroll Factor. -static const unsigned MaxUnrollSize = 4; - namespace { // Forward declarations. @@ -169,8 +165,8 @@ private: /// Add code that checks at runtime if the accessed arrays overlap. /// Returns the comparator value or NULL if no check is needed. - Value *addRuntimeCheck(LoopVectorizationLegality *Legal, - Instruction *Loc); + Instruction *addRuntimeCheck(LoopVectorizationLegality *Legal, + Instruction *Loc); /// Create an empty loop, based on the loop ranges of the old loop. void createEmptyLoop(LoopVectorizationLegality *Legal); /// Copy and widen the instructions from the old loop. @@ -196,6 +192,10 @@ private: /// of scalars. void scalarizeInstruction(Instruction *Instr); + /// Vectorize Load and Store instructions, + void vectorizeMemoryInstruction(Instruction *Instr, + LoopVectorizationLegality *Legal); + /// Create a broadcast instruction. 
This method generates a broadcast /// instruction (shuffle) for loop invariant values and for the induction /// value. If this is the induction variable then we extend it to N, N+1, ... @@ -228,31 +228,34 @@ private: ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {} /// \return True if 'Key' is saved in the Value Map. - bool has(Value *Key) { return MapStoreage.count(Key); } + bool has(Value *Key) const { return MapStorage.count(Key); } /// Initializes a new entry in the map. Sets all of the vector parts to the /// save value in 'Val'. /// \return A reference to a vector with splat values. VectorParts &splat(Value *Key, Value *Val) { - MapStoreage[Key].clear(); - MapStoreage[Key].append(UF, Val); - return MapStoreage[Key]; + VectorParts &Entry = MapStorage[Key]; + Entry.assign(UF, Val); + return Entry; } ///\return A reference to the value that is stored at 'Key'. VectorParts &get(Value *Key) { - if (!has(Key)) - MapStoreage[Key].resize(UF); - return MapStoreage[Key]; + VectorParts &Entry = MapStorage[Key]; + if (Entry.empty()) + Entry.resize(UF); + assert(Entry.size() == UF); + return Entry; } + private: /// The unroll factor. Each entry in the map stores this number of vector /// elements. unsigned UF; /// Map storage. We use std::map and not DenseMap because insertions to a /// dense map invalidates its iterators. - std::map<Value*, VectorParts> MapStoreage; + std::map<Value *, VectorParts> MapStorage; }; /// The original loop. @@ -289,8 +292,8 @@ private: BasicBlock *LoopVectorBody; ///The scalar loop body. BasicBlock *LoopScalarBody; - ///The first bypass block. - BasicBlock *LoopBypassBlock; + /// A list of all bypass blocks. The first block is the entry of the loop. + SmallVector<BasicBlock *, 4> LoopBypassBlocks; /// The new Induction variable which was added to the new block. PHINode *Induction; @@ -316,8 +319,9 @@ private: class LoopVectorizationLegality { public: LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL, - DominatorTree *DT) - : TheLoop(L), SE(SE), DL(DL), DT(DT), Induction(0) {} + DominatorTree *DT, TargetTransformInfo* TTI, + AliasAnalysis* AA) + : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), Induction(0) {} /// This enum represents the kinds of reductions that we support. enum ReductionKind { @@ -336,7 +340,8 @@ public: IK_NoInduction, ///< Not an induction variable. IK_IntInduction, ///< Integer induction variable. Step = 1. IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1. - IK_PtrInduction ///< Pointer induction variable. Step = sizeof(elem). + IK_PtrInduction, ///< Pointer induction var. Step = sizeof(elem). + IK_ReversePtrInduction ///< Reverse ptr indvar. Step = - sizeof(elem). }; /// This POD struct holds information about reduction variables. @@ -400,6 +405,11 @@ public: /// induction descriptor. typedef MapVector<PHINode*, InductionInfo> InductionList; + /// Alias(Multi)Map stores the values (GEPs or underlying objects and their + /// respective Store/Load instruction(s) to calculate aliasing. + typedef DenseMap<Value*, Instruction* > AliasMap; + typedef DenseMap<Value*, std::vector<Instruction*> > AliasMultiMap; + /// Returns true if it is legal to vectorize this loop. /// This does not mean that it is profitable to vectorize this /// loop, only that it is legal to do so. @@ -473,6 +483,14 @@ private: InductionKind isInductionVariable(PHINode *Phi); /// Return true if can compute the address bounds of Ptr within the loop. 
bool hasComputableBounds(Value *Ptr); + /// Return true if there is the chance of write reorder. + bool hasPossibleGlobalWriteReorder(Value *Object, + Instruction *Inst, + AliasMultiMap &WriteObjects, + unsigned MaxByteWidth); + /// Return the AA location for a load or a store. + AliasAnalysis::Location getLoadStoreLocation(Instruction *Inst); + /// The loop that we evaluate. Loop *TheLoop; @@ -480,8 +498,12 @@ private: ScalarEvolution *SE; /// DataLayout analysis. DataLayout *DL; - // Dominators. + /// Dominators. DominatorTree *DT; + /// Target Info. + TargetTransformInfo *TTI; + /// Alias Analysis. + AliasAnalysis *AA; // --- vectorization state --- // @@ -517,20 +539,34 @@ class LoopVectorizationCostModel { public: LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, LoopVectorizationLegality *Legal, - const TargetTransformInfo &TTI) - : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI) {} - - /// \return The most profitable vectorization factor. + const TargetTransformInfo &TTI, + DataLayout *DL) + : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL) {} + + /// Information about vectorization costs + struct VectorizationFactor { + unsigned Width; // Vector width with best cost + unsigned Cost; // Cost of the loop with that width + }; + /// \return The most profitable vectorization factor and the cost of that VF. /// This method checks every power of two up to VF. If UserVF is not ZERO /// then this vectorization factor will be selected if vectorization is /// possible. - unsigned selectVectorizationFactor(bool OptForSize, unsigned UserVF); + VectorizationFactor selectVectorizationFactor(bool OptForSize, + unsigned UserVF); + /// \return The size (in bits) of the widest type in the code that + /// needs to be vectorized. We ignore values that remain scalar such as + /// 64 bit loop indices. + unsigned getWidestType(); /// \return The most profitable unroll factor. /// If UserUF is non-zero then this method finds the best unroll-factor /// based on register pressure and other parameters. - unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF); + /// VF and LoopCost are the selected vectorization factor and the cost of the + /// selected VF. + unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF, unsigned VF, + unsigned LoopCost); /// \brief A struct that represents some properties of the register usage /// of a loop. @@ -562,6 +598,10 @@ private: /// the scalar type. static Type* ToVectorTy(Type *Scalar, unsigned VF); + /// Returns whether the instruction is a load or store and will be a emitted + /// as a vector operation. + bool isConsecutiveLoadOrStore(Instruction *I); + /// The loop that we evaluate. Loop *TheLoop; /// Scev analysis. @@ -572,6 +612,8 @@ private: LoopVectorizationLegality *Legal; /// Vector target information. const TargetTransformInfo &TTI; + /// Target data layout information. + DataLayout *DL; }; /// The LoopVectorize Pass. @@ -588,6 +630,7 @@ struct LoopVectorize : public LoopPass { LoopInfo *LI; TargetTransformInfo *TTI; DominatorTree *DT; + AliasAnalysis *AA; virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { // We only vectorize innermost loops. @@ -599,21 +642,22 @@ struct LoopVectorize : public LoopPass { LI = &getAnalysis<LoopInfo>(); TTI = &getAnalysis<TargetTransformInfo>(); DT = &getAnalysis<DominatorTree>(); + AA = getAnalysisIfAvailable<AliasAnalysis>(); DEBUG(dbgs() << "LV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); // Check if it is legal to vectorize the loop. 
- LoopVectorizationLegality LVL(L, SE, DL, DT); + LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA); if (!LVL.canVectorize()) { DEBUG(dbgs() << "LV: Not vectorizing.\n"); return false; } // Use the cost model. - LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI); + LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL); - // Check the function attribues to find out if this function should be + // Check the function attributes to find out if this function should be // optimized for size. Function *F = L->getHeader()->getParent(); Attribute::AttrKind SzAttr = Attribute::OptimizeForSize; @@ -628,20 +672,24 @@ struct LoopVectorize : public LoopPass { return false; } - unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor); - unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll); + // Select the optimal vectorization factor. + LoopVectorizationCostModel::VectorizationFactor VF; + VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor); + // Select the unroll factor. + unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll, + VF.Width, VF.Cost); - if (VF == 1) { + if (VF.Width == 1) { DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); return false; } - DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<< + DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<< F->getParent()->getModuleIdentifier()<<"\n"); DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n"); - // If we decided that it is *legal* to vectorizer the loop then do it. - InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF, UF); + // If we decided that it is *legal* to vectorize the loop then do it. + InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF.Width, UF); LB.vectorize(&LVL); DEBUG(verifyFunction(*L->getHeader()->getParent())); @@ -730,6 +778,9 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx, int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr"); + // Make sure that the pointer does not point to structs. + if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType()) + return 0; // If this value is a pointer induction variable we know it is consecutive. PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr); @@ -737,6 +788,8 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { InductionInfo II = Inductions[Phi]; if (IK_PtrInduction == II.IK) return 1; + else if (IK_ReversePtrInduction == II.IK) + return -1; } GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); @@ -746,6 +799,29 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { unsigned NumOperands = Gep->getNumOperands(); Value *LastIndex = Gep->getOperand(NumOperands - 1); + Value *GpPtr = Gep->getPointerOperand(); + // If this GEP value is a consecutive pointer induction variable and all of + // the indices are constant then we know it is consecutive. We can + Phi = dyn_cast<PHINode>(GpPtr); + if (Phi && Inductions.count(Phi)) { + + // Make sure that the pointer does not point to structs. + PointerType *GepPtrType = cast<PointerType>(GpPtr->getType()); + if (GepPtrType->getElementType()->isAggregateType()) + return 0; + + // Make sure that all of the index operands are loop invariant. 
+ for (unsigned i = 1; i < NumOperands; ++i) + if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) + return 0; + + InductionInfo II = Inductions[Phi]; + if (IK_PtrInduction == II.IK) + return 1; + else if (IK_ReversePtrInduction == II.IK) + return -1; + } + // Check that all of the gep indices are uniform except for the last. for (unsigned i = 0; i < NumOperands - 1; ++i) if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) @@ -784,8 +860,7 @@ InnerLoopVectorizer::getVectorValue(Value *V) { // If this scalar is unknown, assume that it is a constant or that it is // loop invariant. Broadcast V and save the value for future uses. Value *B = getBroadcastInstrs(V); - WidenMap.splat(V, B); - return WidenMap.get(V); + return WidenMap.splat(V, B); } Value *InnerLoopVectorizer::reverseVector(Value *Vec) { @@ -799,6 +874,111 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) { "reverse"); } + +void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, + LoopVectorizationLegality *Legal) { + // Attempt to issue a wide load. + LoadInst *LI = dyn_cast<LoadInst>(Instr); + StoreInst *SI = dyn_cast<StoreInst>(Instr); + + assert((LI || SI) && "Invalid Load/Store instruction"); + + Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); + Type *DataTy = VectorType::get(ScalarDataTy, VF); + Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); + + // If the pointer is loop invariant or if it is non consecutive, + // scalarize the load. + int Stride = Legal->isConsecutivePtr(Ptr); + bool Reverse = Stride < 0; + bool UniformLoad = LI && Legal->isUniform(Ptr); + if (Stride == 0 || UniformLoad) + return scalarizeInstruction(Instr); + + Constant *Zero = Builder.getInt32(0); + VectorParts &Entry = WidenMap.get(Instr); + + // Handle consecutive loads/stores. + GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); + if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { + Value *PtrOperand = Gep->getPointerOperand(); + Value *FirstBasePtr = getVectorValue(PtrOperand)[0]; + FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero); + + // Create the new GEP with the new induction variable. + GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); + Gep2->setOperand(0, FirstBasePtr); + Gep2->setName("gep.indvar.base"); + Ptr = Builder.Insert(Gep2); + } else if (Gep) { + assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()), + OrigLoop) && "Base ptr must be invariant"); + + // The last index does not have to be the induction. It can be + // consecutive and be a function of the index. For example A[I+1]; + unsigned NumOperands = Gep->getNumOperands(); + + Value *LastGepOperand = Gep->getOperand(NumOperands - 1); + VectorParts &GEPParts = getVectorValue(LastGepOperand); + Value *LastIndex = GEPParts[0]; + LastIndex = Builder.CreateExtractElement(LastIndex, Zero); + + // Create the new GEP with the new induction variable. + GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); + Gep2->setOperand(NumOperands - 1, LastIndex); + Gep2->setName("gep.indvar.idx"); + Ptr = Builder.Insert(Gep2); + } else { + // Use the induction element ptr. 
+ assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); + VectorParts &PtrVal = getVectorValue(Ptr); + Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); + } + + // Handle Stores: + if (SI) { + assert(!Legal->isUniform(SI->getPointerOperand()) && + "We do not allow storing to uniform addresses"); + + VectorParts &StoredVal = getVectorValue(SI->getValueOperand()); + for (unsigned Part = 0; Part < UF; ++Part) { + // Calculate the pointer for the specific unroll-part. + Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); + + if (Reverse) { + // If we store to reverse consecutive memory locations then we need + // to reverse the order of elements in the stored value. + StoredVal[Part] = reverseVector(StoredVal[Part]); + // If the address is consecutive but reversed, then the + // wide store needs to start at the last vector element. + PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); + PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); + } + + Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo()); + Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment); + } + } + + for (unsigned Part = 0; Part < UF; ++Part) { + // Calculate the pointer for the specific unroll-part. + Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); + + if (Reverse) { + // If the address is consecutive but reversed, then the + // wide store needs to start at the last vector element. + PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); + PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); + } + + Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo()); + Value *LI = Builder.CreateLoad(VecPtr, "wide.load"); + cast<LoadInst>(LI)->setAlignment(Alignment); + Entry[Part] = Reverse ? reverseVector(LI) : LI; + } +} + void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // Holds vector parameters or scalars, in case of uniform vals. 
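The pointer arithmetic for reversed accesses above is easy to misread: each unroll part of a reversed access must begin at the last element it covers. The offsets reduce to the following element-index math (a sketch only; the real code emits GEPs):

    #include <cstdint>

    // Offset (in elements) of the first lane touched by one unroll part.
    // Forward:  Part * VF past the base pointer.
    // Reverse:  -Part * VF, then back up another VF - 1 elements so the
    //           wide access starts at the last element it covers.
    std::int64_t partStartOffset(unsigned Part, unsigned VF, bool Reverse) {
      std::int64_t P = Part, W = VF;
      if (!Reverse)
        return P * W;
      return -P * W + (1 - W);
    }

For VF = 4 and UF = 2 this gives forward starts {0, 4} and reverse starts {-3, -7}, matching the CreateGEP(Ptr, -Part * VF) followed by CreateGEP(PartPtr, 1 - VF) sequence above.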
@@ -870,7 +1050,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { } } -Value* +Instruction * InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, Instruction *Loc) { LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = @@ -879,7 +1059,7 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, if (!PtrRtCheck->Need) return NULL; - Value *MemoryRuntimeCheck = 0; + Instruction *MemoryRuntimeCheck = 0; unsigned NumPointers = PtrRtCheck->Pointers.size(); SmallVector<Value* , 2> Starts; SmallVector<Value* , 2> Ends; @@ -908,28 +1088,23 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, } } + IRBuilder<> ChkBuilder(Loc); + for (unsigned i = 0; i < NumPointers; ++i) { for (unsigned j = i+1; j < NumPointers; ++j) { - Instruction::CastOps Op = Instruction::BitCast; - Value *Start0 = CastInst::Create(Op, Starts[i], PtrArithTy, "bc", Loc); - Value *Start1 = CastInst::Create(Op, Starts[j], PtrArithTy, "bc", Loc); - Value *End0 = CastInst::Create(Op, Ends[i], PtrArithTy, "bc", Loc); - Value *End1 = CastInst::Create(Op, Ends[j], PtrArithTy, "bc", Loc); - - Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, - Start0, End1, "bound0", Loc); - Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, - Start1, End0, "bound1", Loc); - Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1, - "found.conflict", Loc); + Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc"); + Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc"); + Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy, "bc"); + Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy, "bc"); + + Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0"); + Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1"); + Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict"); if (MemoryRuntimeCheck) - MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or, - MemoryRuntimeCheck, - IsConflict, - "conflict.rdx", Loc); - else - MemoryRuntimeCheck = IsConflict; + IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, + "conflict.rdx"); + MemoryRuntimeCheck = cast<Instruction>(IsConflict); } } @@ -943,7 +1118,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { the vectorized instructions while the old loop will continue to run the scalar remainder. - [ ] <-- vector loop bypass. + [ ] <-- vector loop bypass (may consist of multiple blocks). / | / v | [ ] <-- vector pre header. @@ -1004,10 +1179,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { ConstantInt::get(IdxTy, 0); assert(BypassBlock && "Invalid loop structure"); - - // Generate the code that checks in runtime if arrays overlap. - Value *MemoryRuntimeCheck = addRuntimeCheck(Legal, - BypassBlock->getTerminator()); + LoopBypassBlocks.push_back(BypassBlock); // Split the single block loop into the two loop structure described above. BasicBlock *VectorPH = @@ -1019,10 +1191,6 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { BasicBlock *ScalarPH = MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); - // This is the location in which we add all of the logic for bypassing - // the new vector loop. - Instruction *Loc = BypassBlock->getTerminator(); - // Use this IR builder to create the loop instructions (Phi, Br, Cmp) // inside the loop. 
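The runtime memory check assembled through ChkBuilder above is the classic quadratic interval-overlap test: two pointer ranges conflict exactly when each starts no later than the other ends, and the per-pair results are OR-reduced into a single guard. The same logic as straight-line C++ over byte ranges (a sketch; the pass emits this as IR):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    struct PtrRange { std::uintptr_t Start, End; }; // inclusive bounds

    // Pairwise overlap test, OR-reduced exactly like "conflict.rdx".
    bool anyRangesConflict(const std::vector<PtrRange> &Ptrs) {
      bool MemoryRuntimeCheck = false;
      for (std::size_t i = 0; i < Ptrs.size(); ++i)
        for (std::size_t j = i + 1; j < Ptrs.size(); ++j) {
          bool Bound0 = Ptrs[i].Start <= Ptrs[j].End; // "bound0"
          bool Bound1 = Ptrs[j].Start <= Ptrs[i].End; // "bound1"
          MemoryRuntimeCheck |= (Bound0 && Bound1);   // "found.conflict"
        }
      return MemoryRuntimeCheck;
    }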
Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); @@ -1033,42 +1201,62 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // times the unroll factor (num of SIMD instructions). Constant *Step = ConstantInt::get(IdxTy, VF * UF); + // This is the IR builder that we use to add all of the logic for bypassing + // the new vector loop. + IRBuilder<> BypassBuilder(BypassBlock->getTerminator()); + // We may need to extend the index in case there is a type mismatch. // We know that the count starts at zero and does not overflow. if (Count->getType() != IdxTy) { // The exit count can be of pointer type. Convert it to the correct // integer type. if (ExitCount->getType()->isPointerTy()) - Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc); + Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int"); else - Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc); + Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast"); } // Add the start index to the loop count to get the new end index. - Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc); + Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx"); // Now we need to generate the expression for N - (N % VF), which is // the part that the vectorized body will execute. - Value *R = BinaryOperator::CreateURem(Count, Step, "n.mod.vf", Loc); - Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc); - Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx, - "end.idx.rnd.down", Loc); + Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf"); + Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec"); + Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx, + "end.idx.rnd.down"); // Now, compare the new count to zero. If it is zero skip the vector loop and // jump to the scalar loop. - Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, - IdxEndRoundDown, - StartIdx, - "cmp.zero", Loc); - - // If we are using memory runtime checks, include them in. - if (MemoryRuntimeCheck) - Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck, - "CntOrMem", Loc); + Value *Cmp = BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, + "cmp.zero"); + + BasicBlock *LastBypassBlock = BypassBlock; + + // Generate the code that checks in runtime if arrays overlap. We put the + // checks into a separate block to make the more common case of few elements + // faster. + Instruction *MemRuntimeCheck = addRuntimeCheck(Legal, + BypassBlock->getTerminator()); + if (MemRuntimeCheck) { + // Create a new block containing the memory check. + BasicBlock *CheckBlock = BypassBlock->splitBasicBlock(MemRuntimeCheck, + "vector.memcheck"); + LoopBypassBlocks.push_back(CheckBlock); + + // Replace the branch into the memory check block with a conditional branch + // for the "few elements case". + Instruction *OldTerm = BypassBlock->getTerminator(); + BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm); + OldTerm->eraseFromParent(); + + Cmp = MemRuntimeCheck; + LastBypassBlock = CheckBlock; + } - BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); - // Remove the old terminator. - Loc->eraseFromParent(); + LastBypassBlock->getTerminator()->eraseFromParent(); + BranchInst::Create(MiddleBlock, VectorPH, Cmp, + LastBypassBlock); // We are going to resume the execution of the scalar loop. 
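All of the index computations above derive from one identity: with Step = VF * UF, the vector body executes Count - (Count mod Step) iterations and the scalar loop handles the remainder; when the rounded count is zero, the vector loop is bypassed entirely. In plain arithmetic (a sketch with hypothetical names):

    #include <cassert>
    #include <cstdint>

    struct BypassCounts {
      std::uint64_t CountRoundDown; // iterations run by the vector body
      bool SkipVectorLoop;          // branch straight to the scalar loop?
    };

    // Step is VF * UF: one vector iteration retires that many scalar ones.
    BypassCounts computeBypass(std::uint64_t Count, std::uint64_t VF,
                               std::uint64_t UF) {
      std::uint64_t Step = VF * UF;
      assert(Step != 0 && "VF and UF must be nonzero");
      std::uint64_t R = Count % Step;               // "n.mod.vf"
      std::uint64_t CountRoundDown = Count - R;     // "n.vec"
      return {CountRoundDown, CountRoundDown == 0}; // "cmp.zero"
    }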
// Go over all of the induction variables that we found and fix the @@ -1108,30 +1296,45 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { Value *CRD = CountRoundDown; if (CRDSize > IISize) CRD = CastInst::Create(Instruction::Trunc, CountRoundDown, - II.StartValue->getType(), - "tr.crd", BypassBlock->getTerminator()); + II.StartValue->getType(), "tr.crd", + LoopBypassBlocks.back()->getTerminator()); else if (CRDSize < IISize) CRD = CastInst::Create(Instruction::SExt, CountRoundDown, II.StartValue->getType(), - "sext.crd", BypassBlock->getTerminator()); + "sext.crd", + LoopBypassBlocks.back()->getTerminator()); // Handle reverse integer induction counter: - EndValue = BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end", - BypassBlock->getTerminator()); + EndValue = + BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end", + LoopBypassBlocks.back()->getTerminator()); break; } case LoopVectorizationLegality::IK_PtrInduction: { // For pointer induction variables, calculate the offset using // the end index. - EndValue = GetElementPtrInst::Create(II.StartValue, CountRoundDown, - "ptr.ind.end", - BypassBlock->getTerminator()); + EndValue = + GetElementPtrInst::Create(II.StartValue, CountRoundDown, "ptr.ind.end", + LoopBypassBlocks.back()->getTerminator()); + break; + } + case LoopVectorizationLegality::IK_ReversePtrInduction: { + // The value at the end of the loop for the reverse pointer is calculated + // by creating a GEP with a negative index starting from the start value. + Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0); + Value *NegIdx = BinaryOperator::CreateSub(Zero, CountRoundDown, + "rev.ind.end", + LoopBypassBlocks.back()->getTerminator()); + EndValue = GetElementPtrInst::Create(II.StartValue, NegIdx, + "rev.ptr.ind.end", + LoopBypassBlocks.back()->getTerminator()); break; } }// end of case // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - ResumeVal->addIncoming(II.StartValue, BypassBlock); + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); ResumeVal->addIncoming(EndValue, VecBody); // Fix the scalar body counter (PHI node). @@ -1147,7 +1350,8 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { assert(!ResumeIndex && "Unexpected resume value found"); ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", MiddleBlock->getTerminator()); - ResumeIndex->addIncoming(StartIdx, BypassBlock); + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]); ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); } @@ -1187,6 +1391,8 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // Insert the new loop into the loop nest and register the new basic blocks. 
if (ParentLoop) { ParentLoop->addChildLoop(Lp); + for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) + ParentLoop->addBasicBlockToLoop(LoopBypassBlocks[I], LI->getBase()); ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); @@ -1203,7 +1409,6 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { LoopExitBlock = ExitBlock; LoopVectorBody = VecBody; LoopScalarBody = OldBasicBlock; - LoopBypassBlock = BypassBlock; } /// This function returns the identity element (or neutral element) for @@ -1295,9 +1500,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // the cost-model. // //===------------------------------------------------===// - BasicBlock &BB = *OrigLoop->getHeader(); - Constant *Zero = - ConstantInt::get(IntegerType::getInt32Ty(BB.getContext()), 0); + Constant *Zero = Builder.getInt32(0); // In order to support reduction variables we need to be able to vectorize // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two @@ -1343,7 +1546,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // To do so, we need to generate the 'identity' vector and overide // one of the elements with the incoming scalar reduction. We need // to do it in the vector-loop preheader. - Builder.SetInsertPoint(LoopBypassBlock->getTerminator()); + Builder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator()); // This is the vector-clone of the value that leaves the loop. VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr); @@ -1391,7 +1594,8 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr); PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); Value *StartVal = (part == 0) ? VectorStart : Identity; - NewPhi->addIncoming(StartVal, LoopBypassBlock); + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]); NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody); RdxParts.push_back(NewPhi); } @@ -1533,8 +1737,6 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { void InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB, PhiVector *PV) { - Constant *Zero = Builder.getInt32(0); - // For each instruction in the old loop. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { VectorParts &Entry = WidenMap.get(it); @@ -1568,7 +1770,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // optimizations will clean it up. VectorParts Cond = createEdgeMask(P->getIncomingBlock(0), P->getParent()); - + for (unsigned part = 0; part < UF; ++part) { VectorParts &In0 = getVectorValue(P->getIncomingValue(0)); VectorParts &In1 = getVectorValue(P->getIncomingValue(1)); @@ -1600,6 +1802,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, } case LoopVectorizationLegality::IK_ReverseIntInduction: case LoopVectorizationLegality::IK_PtrInduction: + case LoopVectorizationLegality::IK_ReversePtrInduction: // Handle reverse integer and pointer inductions. Value *StartIdx = 0; // If we have a single integer induction variable then use it. @@ -1635,15 +1838,23 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // Handle the pointer induction variable case. 
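The reduction setup earlier in this hunk builds its start value from the identity vector with the incoming scalar spliced into a single lane. A sketch assuming an integer add reduction (identity 0), illustrative only:

#include <array>
#include <cstdio>

int main() {
  const unsigned VF = 4;
  int IncomingScalar = 7;               // value reaching the reduction PHI
  std::array<int, VF> VectorStart{};    // identity element (0) in every lane
  VectorStart[0] = IncomingScalar;      // override a single lane
  int Sum = 0;
  for (unsigned i = 0; i < VF; ++i) Sum += VectorStart[i];
  printf("start lanes fold back to %d\n", Sum);   // prints 7
  return 0;
}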
assert(P->getType()->isPointerTy() && "Unexpected type."); + // Is this a reverse induction ptr or a consecutive induction ptr. + bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction == + II.IK); + // This is the vector of results. Notice that we don't generate // vector geps because scalar geps result in better code. for (unsigned part = 0; part < UF; ++part) { Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); for (unsigned int i = 0; i < VF; ++i) { - Constant *Idx = ConstantInt::get(Induction->getType(), - i + part * VF); - Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, - "gep.idx"); + int EltIndex = (i + part * VF) * (Reverse ? -1 : 1); + Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); + Value *GlobalIdx; + if (!Reverse) + GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); + else + GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); + Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, "next.gep"); VecVal = Builder.CreateInsertElement(VecVal, SclrGep, @@ -1684,13 +1895,13 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, for (unsigned Part = 0; Part < UF; ++Part) { Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); - // Update the NSW, NUW and Exact flags. - BinaryOperator *VecOp = cast<BinaryOperator>(V); - if (isa<OverflowingBinaryOperator>(BinOp)) { + // Update the NSW, NUW and Exact flags. Notice: V can be an Undef. + BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V); + if (VecOp && isa<OverflowingBinaryOperator>(BinOp)) { VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap()); VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap()); } - if (isa<PossiblyExactOperator>(VecOp)) + if (VecOp && isa<PossiblyExactOperator>(VecOp)) VecOp->setIsExact(BinOp->isExact()); Entry[Part] = V; @@ -1740,124 +1951,10 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, break; } - case Instruction::Store: { - // Attempt to issue a wide store. - StoreInst *SI = dyn_cast<StoreInst>(it); - Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF); - Value *Ptr = SI->getPointerOperand(); - unsigned Alignment = SI->getAlignment(); - - assert(!Legal->isUniform(Ptr) && - "We do not allow storing to uniform addresses"); - - - int Stride = Legal->isConsecutivePtr(Ptr); - bool Reverse = Stride < 0; - if (Stride == 0) { - scalarizeInstruction(it); + case Instruction::Store: + case Instruction::Load: + vectorizeMemoryInstruction(it, Legal); break; - } - - // Handle consecutive stores. - - GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); - if (Gep) { - // The last index does not have to be the induction. It can be - // consecutive and be a function of the index. For example A[I+1]; - unsigned NumOperands = Gep->getNumOperands(); - - Value *LastGepOperand = Gep->getOperand(NumOperands - 1); - VectorParts &GEPParts = getVectorValue(LastGepOperand); - Value *LastIndex = GEPParts[0]; - LastIndex = Builder.CreateExtractElement(LastIndex, Zero); - - // Create the new GEP with the new induction variable. - GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); - Gep2->setOperand(NumOperands - 1, LastIndex); - Ptr = Builder.Insert(Gep2); - } else { - // Use the induction element ptr. 
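A worked example of the per-lane indices produced above for a pointer induction, with VF = 4, UF = 1 and an assumed NormalizedIdx of 8 (NormalizedIdx is the vector loop counter minus the start index):

#include <cstdio>

int main() {
  const int VF = 4, UF = 1;
  long NormalizedIdx = 8;               // some iteration into the vector loop
  for (int Reverse = 0; Reverse <= 1; ++Reverse) {
    for (int part = 0; part < UF; ++part)
      for (int i = 0; i < VF; ++i) {
        long Idx = (i + part * VF) * (Reverse ? -1 : 1);
        long GlobalIdx = Reverse ? Idx - NormalizedIdx : NormalizedIdx + Idx;
        printf("%ld ", GlobalIdx);      // index fed to the scalar GEP
      }
    printf("\n");   // forward: 8 9 10 11; reverse: -8 -9 -10 -11
  }
  return 0;
}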
- assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); - VectorParts &PtrVal = getVectorValue(Ptr); - Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); - } - - VectorParts &StoredVal = getVectorValue(SI->getValueOperand()); - for (unsigned Part = 0; Part < UF; ++Part) { - // Calculate the pointer for the specific unroll-part. - Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); - - if (Reverse) { - // If we store to reverse consecutive memory locations then we need - // to reverse the order of elements in the stored value. - StoredVal[Part] = reverseVector(StoredVal[Part]); - // If the address is consecutive but reversed, then the - // wide store needs to start at the last vector element. - PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); - PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); - } - - Value *VecPtr = Builder.CreateBitCast(PartPtr, StTy->getPointerTo()); - Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment); - } - break; - } - case Instruction::Load: { - // Attempt to issue a wide load. - LoadInst *LI = dyn_cast<LoadInst>(it); - Type *RetTy = VectorType::get(LI->getType(), VF); - Value *Ptr = LI->getPointerOperand(); - unsigned Alignment = LI->getAlignment(); - - // If the pointer is loop invariant or if it is non consecutive, - // scalarize the load. - int Stride = Legal->isConsecutivePtr(Ptr); - bool Reverse = Stride < 0; - if (Legal->isUniform(Ptr) || Stride == 0) { - scalarizeInstruction(it); - break; - } - - GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); - if (Gep) { - // The last index does not have to be the induction. It can be - // consecutive and be a function of the index. For example A[I+1]; - unsigned NumOperands = Gep->getNumOperands(); - - Value *LastGepOperand = Gep->getOperand(NumOperands - 1); - VectorParts &GEPParts = getVectorValue(LastGepOperand); - Value *LastIndex = GEPParts[0]; - LastIndex = Builder.CreateExtractElement(LastIndex, Zero); - - // Create the new GEP with the new induction variable. - GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); - Gep2->setOperand(NumOperands - 1, LastIndex); - Ptr = Builder.Insert(Gep2); - } else { - // Use the induction element ptr. - assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); - VectorParts &PtrVal = getVectorValue(Ptr); - Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); - } - - for (unsigned Part = 0; Part < UF; ++Part) { - // Calculate the pointer for the specific unroll-part. - Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); - - if (Reverse) { - // If the address is consecutive but reversed, then the - // wide store needs to start at the last vector element. - PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); - PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); - } - - Value *VecPtr = Builder.CreateBitCast(PartPtr, RetTy->getPointerTo()); - Value *LI = Builder.CreateLoad(VecPtr, "wide.load"); - cast<LoadInst>(LI)->setAlignment(Alignment); - Entry[Part] = Reverse ? reverseVector(LI) : LI; - } - break; - } case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: @@ -1924,12 +2021,14 @@ void InnerLoopVectorizer::updateAnalysis() { SE->forgetLoop(OrigLoop); // Update the dominator tree information. 
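The reverse-access address math in the code removed above lives on in the new vectorizeMemoryInstruction helper, so a sketch is still useful: for unroll part Part, a reversed wide access of VF elements starts at Ptr - Part*VF - (VF - 1), which is what the two chained GEPs compute. Offsets here are assumed.

#include <cstdio>

int main() {
  const int VF = 4;
  int Data[16];
  int *Ptr = Data + 12;                 // address of the current scalar lane
  for (int Part = 0; Part < 2; ++Part) {
    int *PartPtr = Ptr + (-Part * VF) + (1 - VF); // GEP(GEP(Ptr,-Part*VF),1-VF)
    printf("part %d starts at offset %td\n", Part, PartPtr - Data);
  }
  // part 0 starts at offset 9, part 1 at offset 5: the wide accesses walk
  // backwards through memory in VF-element chunks.
  return 0;
}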
- assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) && + assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && "Entry does not dominate exit."); - DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock); + for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) + DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]); + DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back()); DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader); - DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock); + DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front()); DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock); DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); @@ -2196,7 +2295,51 @@ void LoopVectorizationLegality::collectLoopUniforms() { } } +AliasAnalysis::Location +LoopVectorizationLegality::getLoadStoreLocation(Instruction *Inst) { + if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) + return AA->getLocation(Store); + else if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) + return AA->getLocation(Load); + + llvm_unreachable("Should be either load or store instruction"); +} + +bool +LoopVectorizationLegality::hasPossibleGlobalWriteReorder( + Value *Object, + Instruction *Inst, + AliasMultiMap& WriteObjects, + unsigned MaxByteWidth) { + + AliasAnalysis::Location ThisLoc = getLoadStoreLocation(Inst); + + std::vector<Instruction*>::iterator + it = WriteObjects[Object].begin(), + end = WriteObjects[Object].end(); + + for (; it != end; ++it) { + Instruction* I = *it; + if (I == Inst) + continue; + + AliasAnalysis::Location ThatLoc = getLoadStoreLocation(I); + if (AA->alias(ThisLoc.getWithNewSize(MaxByteWidth), + ThatLoc.getWithNewSize(MaxByteWidth))) + return true; + } + return false; +} + bool LoopVectorizationLegality::canVectorizeMemory() { + + if (TheLoop->isAnnotatedParallel()) { + DEBUG(dbgs() + << "LV: A loop annotated parallel, ignore memory dependency " + << "checks.\n"); + return true; + } + typedef SmallVector<Value*, 16> ValueVector; typedef SmallPtrSet<Value*, 16> ValueSet; // Holds the Load and Store *instructions*. @@ -2250,9 +2393,10 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return true; } - // Holds the read and read-write *pointers* that we find. - ValueVector Reads; - ValueVector ReadWrites; + // Holds the read and read-write *pointers* that we find. These maps hold + // unique values for pointers (so no need for multi-map). + AliasMap Reads; + AliasMap ReadWrites; // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects // multiple times on the same object. If the ptr is accessed twice, once @@ -2274,7 +2418,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { // If we did *not* see this pointer before, insert it to // the read-write list. At this phase it is only a 'write' list. if (Seen.insert(Ptr)) - ReadWrites.push_back(Ptr); + ReadWrites.insert(std::make_pair(Ptr, ST)); } for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { @@ -2289,7 +2433,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { // read a few words, modify, and write a few words, and some of the // words may be written to the same address. if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr)) - Reads.push_back(Ptr); + Reads.insert(std::make_pair(Ptr, LD)); } // If we write (or read-write) to a single destination and there are no @@ -2302,22 +2446,27 @@ bool LoopVectorizationLegality::canVectorizeMemory() { // Find pointers with computable bounds. 
We are going to use this information // to place a runtime bound check. bool CanDoRT = true; - for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) - if (hasComputableBounds(*I)) { - PtrRtCheck.insert(SE, TheLoop, *I); - DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); + AliasMap::iterator MI, ME; + for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) { + Value *V = (*MI).first; + if (hasComputableBounds(V)) { + PtrRtCheck.insert(SE, TheLoop, V); + DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n"); } else { CanDoRT = false; break; } - for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) - if (hasComputableBounds(*I)) { - PtrRtCheck.insert(SE, TheLoop, *I); - DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); + } + for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) { + Value *V = (*MI).first; + if (hasComputableBounds(V)) { + PtrRtCheck.insert(SE, TheLoop, V); + DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n"); } else { CanDoRT = false; break; } + } // Check that we did not collect too many pointers or found a // unsizeable pointer. @@ -2332,47 +2481,104 @@ bool LoopVectorizationLegality::canVectorizeMemory() { bool NeedRTCheck = false; + // Biggest vectorized access possible, vector width * unroll factor. + // TODO: We're being very pessimistic here, find a way to know the + // real access width before getting here. + unsigned MaxByteWidth = (TTI->getRegisterBitWidth(true) / 8) * + TTI->getMaximumUnrollFactor(); // Now that the pointers are in two lists (Reads and ReadWrites), we // can check that there are no conflicts between each of the writes and // between the writes to the reads. - ValueSet WriteObjects; + // Note that WriteObjects duplicates the stores (indexed now by underlying + // objects) to avoid pointing to elements inside ReadWrites. + // TODO: Maybe create a new type where they can interact without duplication. + AliasMultiMap WriteObjects; ValueVector TempObjects; // Check that the read-writes do not conflict with other read-write // pointers. bool AllWritesIdentified = true; - for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) { - GetUnderlyingObjects(*I, TempObjects, DL); - for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); - it != e; ++it) { - if (!isIdentifiedObject(*it)) { - DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n"); + for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) { + Value *Val = (*MI).first; + Instruction *Inst = (*MI).second; + + GetUnderlyingObjects(Val, TempObjects, DL); + for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end(); + UI != UE; ++UI) { + if (!isIdentifiedObject(*UI)) { + DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **UI <<"\n"); NeedRTCheck = true; AllWritesIdentified = false; } - if (!WriteObjects.insert(*it)) { + + // Never seen it before, can't alias. + if (WriteObjects[*UI].empty()) { + DEBUG(dbgs() << "LV: Adding Underlying value:" << **UI <<"\n"); + WriteObjects[*UI].push_back(Inst); + continue; + } + // Direct alias found. + if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) { + DEBUG(dbgs() << "LV: Found a possible write-write reorder:" + << **UI <<"\n"); + return false; + } + DEBUG(dbgs() << "LV: Found a conflicting global value:" + << **UI <<"\n"); + DEBUG(dbgs() << "LV: While examining store:" << *Inst <<"\n"); + DEBUG(dbgs() << "LV: On value:" << *Val <<"\n"); + + // If global alias, make sure they do alias. 
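A conservative stand-in for the widened alias query behind hasPossibleGlobalWriteReorder: the real code asks AliasAnalysis with Location::getWithNewSize(MaxByteWidth); the sketch below models that as interval overlap, with an assumed register width and unroll factor.

#include <cstdint>
#include <cstdio>

// Two accesses widened to MaxByteWidth bytes may conflict if their
// [Ptr, Ptr + MaxByteWidth) intervals overlap.
static bool MayOverlap(uint64_t A, uint64_t B, uint64_t MaxByteWidth) {
  return A < B + MaxByteWidth && B < A + MaxByteWidth;
}

int main() {
  // Biggest vectorized access possible: register bytes * unroll factor,
  // e.g. a 256-bit register and a maximum unroll of 2.
  uint64_t MaxByteWidth = (256 / 8) * 2;
  printf("%d %d\n", (int)MayOverlap(0, 100, MaxByteWidth),   // 0: disjoint
                    (int)MayOverlap(0, 40, MaxByteWidth));   // 1: may reorder
  return 0;
}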
+ if (hasPossibleGlobalWriteReorder(*UI, + Inst, + WriteObjects, + MaxByteWidth)) { DEBUG(dbgs() << "LV: Found a possible write-write reorder:" - << **it <<"\n"); + << *UI <<"\n"); return false; } + + // Didn't alias, insert into map for further reference. + WriteObjects[*UI].push_back(Inst); } TempObjects.clear(); } /// Check that the reads don't conflict with the read-writes. - for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) { - GetUnderlyingObjects(*I, TempObjects, DL); - for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); - it != e; ++it) { + for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) { + Value *Val = (*MI).first; + GetUnderlyingObjects(Val, TempObjects, DL); + for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end(); + UI != UE; ++UI) { // If all of the writes are identified then we don't care if the read // pointer is identified or not. - if (!AllWritesIdentified && !isIdentifiedObject(*it)) { - DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n"); + if (!AllWritesIdentified && !isIdentifiedObject(*UI)) { + DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **UI <<"\n"); NeedRTCheck = true; } - if (WriteObjects.count(*it)) { - DEBUG(dbgs() << "LV: Found a possible read/write reorder:" - << **it <<"\n"); + + // Never seen it before, can't alias. + if (WriteObjects[*UI].empty()) + continue; + // Direct alias found. + if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) { + DEBUG(dbgs() << "LV: Found a possible write-write reorder:" + << **UI <<"\n"); + return false; + } + DEBUG(dbgs() << "LV: Found a global value: " + << **UI <<"\n"); + Instruction *Inst = (*MI).second; + DEBUG(dbgs() << "LV: While examining load:" << *Inst <<"\n"); + DEBUG(dbgs() << "LV: On value:" << *Val <<"\n"); + + // If global alias, make sure they do alias. + if (hasPossibleGlobalWriteReorder(*UI, + Inst, + WriteObjects, + MaxByteWidth)) { + DEBUG(dbgs() << "LV: Found a possible read-write reorder:" + << *UI <<"\n"); return false; } } @@ -2535,7 +2741,7 @@ LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) return IK_NoInduction; - // Check that the PHI is consecutive and starts at zero. + // Check that the PHI is consecutive. const SCEV *PhiScev = SE->getSCEV(Phi); const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); if (!AR) { @@ -2562,6 +2768,8 @@ LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType()); if (C->getValue()->equalsInt(Size)) return IK_PtrInduction; + else if (C->getValue()->equalsInt(0 - Size)) + return IK_ReversePtrInduction; return IK_NoInduction; } @@ -2612,18 +2820,34 @@ bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) { return AR->isAffine(); } -unsigned +LoopVectorizationCostModel::VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, unsigned UserVF) { + // Width 1 means no vectorize + VectorizationFactor Factor = { 1U, 0U }; if (OptForSize && Legal->getRuntimePointerCheck()->Need) { DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n"); - return 1; + return Factor; } // Find the trip count. 
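The new IK_ReversePtrInduction kind introduced earlier in this hunk is recognized purely from the SCEV step constant: a step of +sizeof(elem) walks forward, a step of -sizeof(elem) backward. A toy classifier, illustrative only:

#include <cstdint>
#include <cstdio>

enum InductionKind { IK_NoInduction, IK_PtrInduction, IK_ReversePtrInduction };

static InductionKind classify(int64_t StepBytes, uint64_t Size) {
  if (StepBytes == (int64_t)Size) return IK_PtrInduction;
  if (StepBytes == -(int64_t)Size) return IK_ReversePtrInduction;
  return IK_NoInduction;
}

int main() {
  printf("%d %d %d\n", classify(4, 4), classify(-4, 4), classify(8, 4));
  // prints: 1 2 0
  return 0;
}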
unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch()); DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n"); + unsigned WidestType = getWidestType(); + unsigned WidestRegister = TTI.getRegisterBitWidth(true); + unsigned MaxVectorSize = WidestRegister / WidestType; + DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n"); + DEBUG(dbgs() << "LV: The Widest register is:" << WidestRegister << "bits.\n"); + + if (MaxVectorSize == 0) { + DEBUG(dbgs() << "LV: The target has no vector registers.\n"); + MaxVectorSize = 1; + } + + assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements" + " into one vector!"); + unsigned VF = MaxVectorSize; // If we optimize the program for size, avoid creating the tail loop. @@ -2631,7 +2855,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, // If we are unable to calculate the trip count then don't try to vectorize. if (TC < 2) { DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); - return 1; + return Factor; } // Find the maximum SIMD width that can fit within the trip count. @@ -2644,7 +2868,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, // zero then we require a tail. if (VF < 2) { DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); - return 1; + return Factor; } } @@ -2652,7 +2876,8 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n"); - return UserVF; + Factor.Width = UserVF; + return Factor; } float Cost = expectedCost(1); @@ -2672,12 +2897,70 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, } DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n"); - return Width; + Factor.Width = Width; + Factor.Cost = Width * Cost; + return Factor; +} + +unsigned LoopVectorizationCostModel::getWidestType() { + unsigned MaxWidth = 8; + + // For each block. + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); bb != be; ++bb) { + BasicBlock *BB = *bb; + + // For each instruction in the loop. + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + Type *T = it->getType(); + + // Only examine Loads, Stores and PHINodes. + if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it)) + continue; + + // Examine PHI nodes that are reduction variables. + if (PHINode *PN = dyn_cast<PHINode>(it)) + if (!Legal->getReductionVars()->count(PN)) + continue; + + // Examine the stored values. + if (StoreInst *ST = dyn_cast<StoreInst>(it)) + T = ST->getValueOperand()->getType(); + + // Ignore loaded pointer types and stored pointer types that are not + // consecutive. However, we do want to take consecutive stores/loads of + // pointer vectors into account. + if (T->isPointerTy() && !isConsecutiveLoadOrStore(it)) + continue; + + MaxWidth = std::max(MaxWidth, + (unsigned)DL->getTypeSizeInBits(T->getScalarType())); + } + } + + return MaxWidth; } unsigned LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, - unsigned UserUF) { + unsigned UserUF, + unsigned VF, + unsigned LoopCost) { + + // -- The unroll heuristics -- + // We unroll the loop in order to expose ILP and reduce the loop overhead. + // There are many micro-architectural considerations that we can't predict + // at this level. For example frontend pressure (on decode or fetch) due to + // code size, or the number and capabilities of the execution ports. 
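A back-of-the-envelope version of the width selection just computed: the candidate VF is the widest register divided by the widest scalar type among the loop's loads, stores, and reduction PHIs. The numbers below are assumptions.

#include <cstdio>

int main() {
  unsigned WidestRegister = 256;   // e.g. a 256-bit vector unit
  unsigned WidestType = 32;        // widest load/store/reduction type, in bits
  unsigned MaxVectorSize = WidestRegister / WidestType;
  if (MaxVectorSize == 0) MaxVectorSize = 1;    // scalar-only target
  printf("candidate VF = %u\n", MaxVectorSize); // 8 lanes
  return 0;
}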
+ // + // We use the following heuristics to select the unroll factor: + // 1. If the code has reductions then we unroll in order to break the cross + // iteration dependency. + // 2. If the loop is really small then we unroll in order to reduce the loop + // overhead. + // 3. We don't unroll if we think that we will spill registers to memory due + // to the increased register pressure. + // Use the user preference, unless 'auto' is selected. if (UserUF != 0) return UserUF; @@ -2710,17 +2993,39 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, // fit without causing spills. unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers; - // We don't want to unroll the loops to the point where they do not fit into - // the decoded cache. Assume that we only allow 32 IR instructions. - UF = std::min(UF, (MaxLoopSizeThreshold / R.NumInstructions)); - // Clamp the unroll factor ranges to reasonable factors. + unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor(); + + // If we did not calculate the cost for VF (because the user selected the VF) + // then we calculate the cost of VF here. + if (LoopCost == 0) + LoopCost = expectedCost(VF); + + // Clamp the calculated UF to be between 1 and the max unroll factor + // that the target allows. if (UF > MaxUnrollSize) UF = MaxUnrollSize; else if (UF < 1) UF = 1; - return UF; + if (Legal->getReductionVars()->size()) { + DEBUG(dbgs() << "LV: Unrolling because of reductions. \n"); + return UF; + } + + // We want to unroll tiny loops in order to reduce the loop overhead. + // We assume that the cost overhead is 1 and we use the cost model + // to estimate the cost of the loop and unroll until the cost of the + // loop overhead is about 5% of the cost of the loop. + DEBUG(dbgs() << "LV: Loop cost is "<< LoopCost <<" \n"); + if (LoopCost < 20) { + DEBUG(dbgs() << "LV: Unrolling to reduce branch cost. \n"); + unsigned NewUF = 20/LoopCost + 1; + return std::min(NewUF, UF); + } + + DEBUG(dbgs() << "LV: Not Unrolling. \n"); + return 1; } LoopVectorizationCostModel::RegisterUsage @@ -2878,9 +3183,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // TODO: We need to estimate the cost of intrinsic calls. switch (I->getOpcode()) { case Instruction::GetElementPtr: - // We mark this instruction as zero-cost because scalar GEPs are usually - // lowered to the intruction addressing mode. At the moment we don't - // generate vector geps. + // We mark this instruction as zero-cost because the cost of GEPs in + // vectorized code depends on whether the corresponding memory instruction + // is scalarized or not. Therefore, we handle GEPs with the memory + // instruction cost. return 0; case Instruction::Br: { return TTI.getCFInstrCost(I->getOpcode()); @@ -2923,83 +3229,59 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { VectorTy = ToVectorTy(ValTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); } - case Instruction::Store: { - StoreInst *SI = cast<StoreInst>(I); - Type *ValTy = SI->getValueOperand()->getType(); + case Instruction::Store: + case Instruction::Load: { + StoreInst *SI = dyn_cast<StoreInst>(I); + LoadInst *LI = dyn_cast<LoadInst>(I); + Type *ValTy = (SI ? SI->getValueOperand()->getType() : + LI->getType()); VectorTy = ToVectorTy(ValTy, VF); + unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment(); + unsigned AS = SI ? SI->getPointerAddressSpace() : + LI->getPointerAddressSpace(); + Value *Ptr = SI ?
SI->getPointerOperand() : LI->getPointerOperand(); + // We add the cost of address computation here instead of with the gep + // instruction because only here we know whether the operation is + // scalarized. if (VF == 1) - return TTI.getMemoryOpCost(I->getOpcode(), VectorTy, - SI->getAlignment(), - SI->getPointerAddressSpace()); + return TTI.getAddressComputationCost(VectorTy) + + TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); - // Scalarized stores. - int Stride = Legal->isConsecutivePtr(SI->getPointerOperand()); + // Scalarized loads/stores. + int Stride = Legal->isConsecutivePtr(Ptr); bool Reverse = Stride < 0; if (0 == Stride) { unsigned Cost = 0; - // The cost of extracting from the value vector and pointer vector. - Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF); + Type *PtrTy = ToVectorTy(Ptr->getType(), VF); for (unsigned i = 0; i < VF; ++i) { - Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, - i); + // The cost of extracting the pointer operand. Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i); + // In case of STORE, the cost of ExtractElement from the vector. + // In case of LOAD, the cost of InsertElement into the returned + // vector. + Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement : + Instruction::InsertElement, + VectorTy, i); } - // The cost of the scalar stores. + // The cost of the scalar loads/stores. + Cost += VF * TTI.getAddressComputationCost(ValTy->getScalarType()); Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), - SI->getAlignment(), - SI->getPointerAddressSpace()); + Alignment, AS); return Cost; } - // Wide stores. - unsigned Cost = TTI.getMemoryOpCost(I->getOpcode(), VectorTy, - SI->getAlignment(), - SI->getPointerAddressSpace()); + // Wide load/stores. + unsigned Cost = TTI.getAddressComputationCost(VectorTy); + Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + if (Reverse) Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); return Cost; } - case Instruction::Load: { - LoadInst *LI = cast<LoadInst>(I); - - if (VF == 1) - return TTI.getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(), - LI->getPointerAddressSpace()); - - // Scalarized loads. - int Stride = Legal->isConsecutivePtr(LI->getPointerOperand()); - bool Reverse = Stride < 0; - if (0 == Stride) { - unsigned Cost = 0; - Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF); - - // The cost of extracting from the pointer vector. - for (unsigned i = 0; i < VF; ++i) - Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i); - - // The cost of inserting data to the result vector. - for (unsigned i = 0; i < VF; ++i) - Cost += TTI.getVectorInstrCost(Instruction::InsertElement, VectorTy, i); - - // The cost of the scalar stores. - Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), RetTy->getScalarType(), - LI->getAlignment(), - LI->getPointerAddressSpace()); - return Cost; - } - - // Wide loads. - unsigned Cost = TTI.getMemoryOpCost(I->getOpcode(), VectorTy, - LI->getAlignment(), - LI->getPointerAddressSpace()); - if (Reverse) - Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); - return Cost; - } case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: @@ -3077,4 +3359,14 @@ namespace llvm { } } +bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { + // Check for a store. 
+ if (StoreInst *ST = dyn_cast<StoreInst>(Inst)) + return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0; + + // Check for a load. + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) + return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0; + return false; +}
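Finally, a worked example of the unroll decision in the rewritten selectUnrollFactor: loops with reductions keep the register-pressure-clamped UF, tiny loops are unrolled until the assumed loop overhead of 1 is about 5% of the loop cost (NewUF = 20/LoopCost + 1), and everything else is left alone. The inputs below are assumed.

#include <algorithm>
#include <cstdio>
#include <initializer_list>

int main() {
  unsigned UF = 4;                    // register-pressure/target clamp
  bool HasReductions = false;
  for (unsigned LoopCost : {3u, 12u, 25u}) {
    unsigned Result;
    if (HasReductions)
      Result = UF;                    // break the cross-iteration dependency
    else if (LoopCost < 20)
      Result = std::min(20 / LoopCost + 1, UF);  // reduce branch overhead
    else
      Result = 1;                     // not worth unrolling
    printf("cost %u -> UF %u\n", LoopCost, Result); // 3->4, 12->2, 25->1
  }
  return 0;
}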